Skip to content
Open
65 changes: 53 additions & 12 deletions internal/inputs/htmlparser.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"strings"
"unicode"

"github.com/newrelic/nri-flex/internal/formatter"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
Expand All @@ -24,9 +25,9 @@ type Table struct {
}

// ParseToJSON parses an HTML fragment or whole document looking for HTML
// tables and returns the tables it finds serialized as a JSON string.
func ParseToJSON(s []byte) (string, error) {
func ParseToJSON(s []byte, htmlAttributes map[string]string) (string, error) {

tables, err := Parse(s)
tables, err := Parse(s, htmlAttributes)
if err != nil {
return "", err
}
Expand Down Expand Up @@ -87,13 +88,14 @@ func convertTable(t *Table, i int) string {

// Parse parses a html fragment or whole document looking for HTML
// tables. It converts all cells into text, stripping away any HTML content.
func Parse(s []byte) ([]*Table, error) {
func Parse(s []byte, htmlAttributes map[string]string) ([]*Table, error) {
node, err := html.Parse(bytes.NewReader(s))
if err != nil {
return nil, err
}
tables := []*Table{}
parse(node, &tables)
var vThead = true
parse(node, &tables, vThead, htmlAttributes)
for kk, t := range tables {

tables[kk] = addMissingColumns(t)
Expand All @@ -102,14 +104,19 @@ func Parse(s []byte) ([]*Table, error) {
return tables, nil
}

// innerText returns the text content of n and all of its descendants,
// concatenated in sibling order.
//
// A text node yields stripChars(n.Data). For any other node the results of
// its children are appended one after another. In the three-argument form,
// when parseAttribute is true and n is an element node, the result starts
// with whatever parseAttributes returns for n.Attr and htmlAttributes; the
// flag is passed down unchanged, so the same applies to nested elements.
func innerText(n *html.Node) string {
func innerText(n *html.Node, parseAttribute bool, htmlAttributes map[string]string) string {
// Text nodes are leaves: return their cleaned-up data directly.
if n.Type == html.TextNode {
stripResult := stripChars(n.Data)
return stripResult
}
result := ""
var result string = ""
// Only element nodes carry attributes, and they are only rendered on request.
if n.Type == html.ElementNode {
if parseAttribute {
result = parseAttributes(n.Attr, htmlAttributes)
}
}
// Recurse into every child and append its text after any attribute prefix.
for x := n.FirstChild; x != nil; x = x.NextSibling {
result += innerText(x)
result += innerText(x, parseAttribute, htmlAttributes)
}
return result
}
Expand Down Expand Up @@ -143,7 +150,7 @@ func containTable(n *html.Node) bool {
return result
}

func parse(n *html.Node, tables *[]*Table) {
func parse(n *html.Node, tables *[]*Table, vThead bool, htmlAttributes map[string]string) {
strip := strings.TrimSpace
switch n.DataAtom {
case atom.Table:
Expand All @@ -155,27 +162,46 @@ func parse(n *html.Node, tables *[]*Table) {
t.Attributes[at.Key] = at.Val
}
*tables = append(*tables, t)
vThead = true
case atom.Th:
t := (*tables)[len(*tables)-1]
t.Headers = append(t.Headers, strip(innerText(n)))
if vThead {
t := (*tables)[len(*tables)-1]
t.Headers = append(t.Headers, strip(innerText(n, false, htmlAttributes)))
} else {
if !containTable(n) {
t := (*tables)[len(*tables)-1]
l := len(t.Rows) - 1
t.Rows[l] = append(t.Rows[l], strip(innerText(n, true, htmlAttributes)))
return
}
t := (*tables)[len(*tables)-1]
l := len(t.Rows) - 1
// If the <th> contains a <table> element, set the cell content to "TableElement"
t.Rows[l] = append(t.Rows[l], "TableElement")
}

case atom.Tr:
t := (*tables)[len(*tables)-1]
t.Rows = append(t.Rows, []string{})
case atom.Td:
if !containTable(n) {
t := (*tables)[len(*tables)-1]
l := len(t.Rows) - 1
t.Rows[l] = append(t.Rows[l], strip(innerText(n)))
t.Rows[l] = append(t.Rows[l], strip(innerText(n, true, htmlAttributes)))
return
}
t := (*tables)[len(*tables)-1]
l := len(t.Rows) - 1
// If the <td> contains <table> element, set the <td> content to "TableElement"
t.Rows[l] = append(t.Rows[l], "TableElement")

case atom.Thead:
vThead = true
case atom.Tbody:
vThead = false
}
for child := n.FirstChild; child != nil; child = child.NextSibling {
parse(child, tables)
parse(child, tables, vThead, htmlAttributes)
}
}

Expand Down Expand Up @@ -208,3 +234,18 @@ func addMissingColumns(t *Table) *Table {
t.Rows = rows
return t
}

// parseAttributes renders the attributes of an HTML element that pass the
// configured filters as a sequence of "key:value;" entries.
//
// input is the element's attribute list. patterns maps an attribute-name
// filter to an attribute-value filter; each is checked through
// formatter.KvFinder in "regex" mode, with the attribute's key or value as
// the second argument and the filter as the third (presumably the filters
// are regular expressions; confirm against formatter.KvFinder). An attribute
// is emitted when at least one map entry accepts both its name and its value.
//
// Entries are written in the order the attributes appear in input, and each
// attribute is written at most once. The result is "" when nothing matches,
// including when input or patterns is nil or empty.
func parseAttributes(input []html.Attribute, patterns map[string]string) string {
	var out strings.Builder
	for _, attr := range input {
		for keyPattern, valPattern := range patterns {
			// The value is only tested once the name has been accepted.
			if formatter.KvFinder("regex", attr.Key, keyPattern) &&
				formatter.KvFinder("regex", attr.Val, valPattern) {
				out.WriteString(attr.Key + ":" + attr.Val + ";")
				// One accepting filter is enough. Continuing would repeat the
				// very same "key:value;" entry for every further filter that
				// also matches this attribute.
				break
			}
		}
	}
	return out.String()
}
29 changes: 22 additions & 7 deletions internal/inputs/htmlparser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,54 +15,69 @@ func TestParseToJSON(t *testing.T) {
}

testCases := map[string]struct {
parseCfg load.API
value string
key string
expected string
parseCfg load.API
value string
htmlAttributes map[string]string
key string
expected string
}{
"SingleTable": {
parseCfg: getConfig(
true),
value: `<html><body>
<table source="myTestPage1">
<thead>
<tr><th>Heading 1</th><th>Heading 11</th><th>Heading 12</th><th>Heading 13</th><th>Heading 14</th></tr>
<tr><td>Data 11</td><td>Data 12</td></tr>
</thead>
<tr><td class="city">Data 11</td><td>Data 12</td></tr>
<tr><td>Data 21</td><td>Data 22</td></tr>
<tr><td>Data 31</td><td>Data 32</td></tr>
<tr><td>Data 41</td><td>Data 42</td></tr>

</table>
</html>`,
expected: `[{"table":[{ "Heading 1": "Data 11", "Heading 11": "Data 12", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 21", "Heading 11": "Data 22", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 31", "Heading 11": "Data 32", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 41", "Heading 11": "Data 42", "Heading 12": "", "Heading 13": "", "Heading 14": ""}], "source": "myTestPage1","Index":0 }]`,
htmlAttributes: map[string]string{
"class": ".*",
},
expected: `[{"table":[{ "Heading 1": "class:city;Data 11", "Heading 11": "Data 12", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 21", "Heading 11": "Data 22", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 31", "Heading 11": "Data 32", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 41", "Heading 11": "Data 42", "Heading 12": "", "Heading 13": "", "Heading 14": ""}], "source": "myTestPage1","Index":0 }]`,
},
"TwoTableWithAttribute": {
parseCfg: getConfig(
true),
value: `<html><body>
<table source="myTestTable1">
<thead>
<tr><th>Heading 1</th><th>Heading 11</th><th>Heading 12</th><th>Heading 13</th><th>Heading 14</th></tr>
</thead>
<tr><td>Data 11</td><td>Data 12</td></tr>
<tr><td>Data 21</td><td>Data 22</td></tr>
<tr><td>Data 31</td><td>Data 32</td></tr>
<tr><td>Data 41</td><td>Data 42</td></tr>

</table>
<p>Stuff in here</p>
<table source="myTestTable2">
<thead>
<tr><th>Heading 21</th><th>Heading 22</th></tr>
<tr><td>Data 211</td><td>Data 212</td></tr>
<tr><td>Data 221</td><td>Data 222</td></tr>
<tr><td>Data 231</td><td><span></span><span><a href="">Data 232</a></span></td></tr>
<tr><td>Data 241</td><td>Data 242</td></tr>
</thead>
</table>
</body>
</html>`,
htmlAttributes: map[string]string{
"class": ".*",
},
expected: `[{"table":[{ "Heading 1": "Data 11", "Heading 11": "Data 12", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 21", "Heading 11": "Data 22", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 31", "Heading 11": "Data 32", "Heading 12": "", "Heading 13": "", "Heading 14": ""},{ "Heading 1": "Data 41", "Heading 11": "Data 42", "Heading 12": "", "Heading 13": "", "Heading 14": ""}], "source": "myTestTable1","Index":0 },{"table":[{ "Heading 21": "Data 211", "Heading 22": "Data 212"},{ "Heading 21": "Data 221", "Heading 22": "Data 222"},{ "Heading 21": "Data 231", "Heading 22": "Data 232"},{ "Heading 21": "Data 241", "Heading 22": "Data 242"}], "source": "myTestTable2","Index":1 }]`,
},
}

for testName, testCase := range testCases {
t.Run(testName, func(t *testing.T) {

result, _ := ParseToJSON([]byte(testCase.value))
result, _ := ParseToJSON([]byte(testCase.value), testCase.htmlAttributes)
assert.Equal(t, testCase.expected, string(result))
})
}
Expand Down
4 changes: 2 additions & 2 deletions internal/inputs/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,9 @@ func RunHTTP(dataStore *[]interface{}, doLoop *bool, yml *load.Config, api load.
handleJSON(dataStore, jsonBody.Bytes(), &resp, doLoop, reqURL, nextLink, api.ReturnHeaders)
}
}
case (contentType == "text/html" || contentType == "text/html; charset=utf-8") && api.ParseHTML:
case (strings.Contains(strings.ToLower(contentType), "text/html")) && api.ParseHTML:
body, _ := ioutil.ReadAll(resp.Body)
jsonBody, err := ParseToJSON(body)
jsonBody, err := ParseToJSON(body, api.ParseHtmlAttributes)
if err != nil {
load.Logrus.WithError(err).Errorf("http: URL %v failed to convert XML to Json resp.Body", *reqURL)
} else {
Expand Down
113 changes: 57 additions & 56 deletions internal/load/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,62 +257,63 @@ type SampleMerge struct {

// API YAML Struct
type API struct {
Name string `yaml:"name"`
EventType string `yaml:"event_type"` // override eventType
Entity string `yaml:"entity"` // define a custom entity name
EntityType string `yaml:"entity_type"` // define a custom entity type (namespace)
Ingest bool `yaml:"ingest"`
Inventory map[string]string `yaml:"inventory"` // set as inventory
InventoryOnly bool `yaml:"inventory_only"` // only generate inventory data
Events map[string]string `yaml:"events"` // set as events
EventsOnly bool `yaml:"events_only"` // only generate events
Merge string `yaml:"merge"` // merge into another eventType
RunAsync bool `yaml:"run_async" ` // API block to run in Async mode when using with lookupstore
AsyncRate int `yaml:"async_rate"` //Async Request Throttle Rate
JoinKey string `yaml:"join_key"` // merge into another eventType
Prefix string `yaml:"prefix"` // prefix attribute keys
File string `yaml:"file"`
URL string `yaml:"url"`
Pagination Pagination `yaml:"pagination"`
EscapeURL bool `yaml:"escape_url"`
Prometheus Prometheus `yaml:"prometheus"`
Cache string `yaml:"cache"` // read data from datastore
Database string `yaml:"database"`
DBDriver string `yaml:"db_driver"`
DBConn string `yaml:"db_conn"`
Shell string `yaml:"shell"`
CommandsAsync bool `yaml:"commands_async"` // run commands async
Commands []Command `yaml:"commands"`
DBQueries []Command `yaml:"db_queries"`
DBAsync bool `yaml:"db_async"` // perform db queries async
Jq string `yaml:"jq"` // parse data using jq
ParseHTML bool `yaml:"parse_html"` // parse text/html content type table element to JSON
Jmx JMX `yaml:"jmx"`
IgnoreLines []int // not implemented - idea is to ignore particular lines starting from 0 of the command output
User, Pass string
Proxy string
TLSConfig TLSConfig `yaml:"tls_config"`
Timeout int
Method string
Payload string
Headers map[string]string `yaml:"headers"`
DisableParentAttr bool `yaml:"disable_parent_attr"`
StartKey []string `yaml:"start_key"` // start from a different section of the payload
StoreLookups map[string]string `yaml:"store_lookups"`
DedupeLookups []string `yaml:"dedupe_lookups"`
StoreVariables map[string]string `yaml:"store_variables"`
LazyFlatten []string `yaml:"lazy_flatten"`
SampleKeys map[string]string `yaml:"sample_keys"`
RenameSamples map[string]string `yaml:"rename_samples"` // using regex if sample has a key that matches, make that a different sample
SkipProcessing []string `yaml:"skip_processing"` // skip processing particular keys using an array of regex strings
InheritAttributes bool `yaml:"inherit_attributes"` // attempts to inherit attributes were possible
CustomAttributes map[string]string `yaml:"custom_attributes"` // set additional custom attributes
SplitObjects bool `yaml:"split_objects"` // convert object with nested objects to array
SplitArray bool `yaml:"split_array"` // convert array to samples, use SetHeader to set attribute name
LeafArray bool `yaml:"leaf_array"` // convert array element to samples when SplitArray, use SetHeader to set attribute name
Scp SCP `yaml:"scp"`
HWSigner HWSigner `yaml:"hw_signer"` // Huawei Cloud Service API signer
AliyunSigner AliyunSigner `yaml:"aliyun_signer"` // Huawei Cloud Service API signer
Name string `yaml:"name"`
EventType string `yaml:"event_type"` // override eventType
Entity string `yaml:"entity"` // define a custom entity name
EntityType string `yaml:"entity_type"` // define a custom entity type (namespace)
Ingest bool `yaml:"ingest"`
Inventory map[string]string `yaml:"inventory"` // set as inventory
InventoryOnly bool `yaml:"inventory_only"` // only generate inventory data
Events map[string]string `yaml:"events"` // set as events
EventsOnly bool `yaml:"events_only"` // only generate events
Merge string `yaml:"merge"` // merge into another eventType
RunAsync bool `yaml:"run_async" ` // API block to run in Async mode when using with lookupstore
AsyncRate int `yaml:"async_rate"` //Async Request Throttle Rate
JoinKey string `yaml:"join_key"` // merge into another eventType
Prefix string `yaml:"prefix"` // prefix attribute keys
File string `yaml:"file"`
URL string `yaml:"url"`
Pagination Pagination `yaml:"pagination"`
EscapeURL bool `yaml:"escape_url"`
Prometheus Prometheus `yaml:"prometheus"`
Cache string `yaml:"cache"` // read data from datastore
Database string `yaml:"database"`
DBDriver string `yaml:"db_driver"`
DBConn string `yaml:"db_conn"`
Shell string `yaml:"shell"`
CommandsAsync bool `yaml:"commands_async"` // run commands async
Commands []Command `yaml:"commands"`
DBQueries []Command `yaml:"db_queries"`
DBAsync bool `yaml:"db_async"` // perform db queries async
Jq string `yaml:"jq"` // parse data using jq
ParseHTML bool `yaml:"parse_html"` // parse text/html content type table element to JSON
ParseHtmlAttributes map[string]string `yaml:"parse_html_attributes"` // parse HTML attributes in table element cell match the regex settings
Jmx JMX `yaml:"jmx"`
IgnoreLines []int // not implemented - idea is to ignore particular lines starting from 0 of the command output
User, Pass string
Proxy string
TLSConfig TLSConfig `yaml:"tls_config"`
Timeout int
Method string
Payload string
Headers map[string]string `yaml:"headers"`
DisableParentAttr bool `yaml:"disable_parent_attr"`
StartKey []string `yaml:"start_key"` // start from a different section of the payload
StoreLookups map[string]string `yaml:"store_lookups"`
DedupeLookups []string `yaml:"dedupe_lookups"`
StoreVariables map[string]string `yaml:"store_variables"`
LazyFlatten []string `yaml:"lazy_flatten"`
SampleKeys map[string]string `yaml:"sample_keys"`
RenameSamples map[string]string `yaml:"rename_samples"` // using regex if sample has a key that matches, make that a different sample
SkipProcessing []string `yaml:"skip_processing"` // skip processing particular keys using an array of regex strings
InheritAttributes bool `yaml:"inherit_attributes"` // attempts to inherit attributes where possible
CustomAttributes map[string]string `yaml:"custom_attributes"` // set additional custom attributes
SplitObjects bool `yaml:"split_objects"` // convert object with nested objects to array
SplitArray bool `yaml:"split_array"` // convert array to samples, use SetHeader to set attribute name
LeafArray bool `yaml:"leaf_array"` // convert array element to samples when SplitArray, use SetHeader to set attribute name
Scp SCP `yaml:"scp"`
HWSigner HWSigner `yaml:"hw_signer"` // Huawei Cloud Service API signer
AliyunSigner AliyunSigner `yaml:"aliyun_signer"` // Aliyun API signer
// Key manipulation
ToLower bool `yaml:"to_lower"` // convert all unicode letters mapped to their lower case.
ConvertSpace string `yaml:"convert_space"` // convert spaces to another char
Expand Down
14 changes: 9 additions & 5 deletions internal/processor/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,9 @@ func CreateMetricSets(samples []interface{}, config *load.Config, i int, mergeMe
runSampleFilterExperimental := true
// check if we should ignore this output completely
// useful when requests are made to generate a lookup, but the data is not needed
if api.IgnoreOutput {
createSample = false
currentSample["event_type"] = eventType
load.IgnoredIntegrationData = append(load.IgnoredIntegrationData, currentSample)
} else {

// else
{
// check if this contains any key pair values to filter out
excludeSample := true
// evaluate sample_include_filter if sample_include_match_all_filter is not specified
Expand All @@ -150,6 +148,12 @@ func CreateMetricSets(samples []interface{}, config *load.Config, i int, mergeMe
}
}

if api.IgnoreOutput && createSample {
createSample = false
currentSample["event_type"] = eventType
load.IgnoredIntegrationData = append(load.IgnoredIntegrationData, currentSample)
}

if createSample {
RunMathCalculations(&api.Math, &currentSample)

Expand Down
Loading
Loading