diff --git a/go.mod b/go.mod index 58c65c96..d1bd6731 100644 --- a/go.mod +++ b/go.mod @@ -33,8 +33,8 @@ require ( github.com/PuerkitoBio/goquery v1.5.1 // indirect github.com/andybalholm/cascadia v1.2.0 // indirect github.com/antchfx/htmlquery v1.2.3 // indirect - github.com/antchfx/xmlquery v1.2.4 // indirect - github.com/antchfx/xpath v1.1.8 // indirect + github.com/antchfx/xmlquery v1.3.1 // indirect + github.com/antchfx/xpath v1.1.10 // indirect github.com/denisenkom/go-mssqldb v0.0.0-20190915052044-aa4949efa320 // indirect github.com/erikstmartin/go-testdb v0.0.0-20160219214506-8d10e4a1bae5 // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect diff --git a/go.sum b/go.sum index 9d50739e..4a83870d 100644 --- a/go.sum +++ b/go.sum @@ -11,11 +11,13 @@ github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5 github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= -github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4= github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= +github.com/antchfx/xmlquery v1.3.1 h1:nIKWdtnhrXtj0/IRUAAw2I7TfpHUa3zMnHvNmPXFg+w= +github.com/antchfx/xmlquery v1.3.1/go.mod h1:64w0Xesg2sTaawIdNqMB+7qaW/bSqkQm+ssPaCMWNnc= github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= -github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk= github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.1.10 h1:cJ0pOvEdN/WvYXxvRrzQH9x5QWKpzHacYO8qzCcDYAg= +github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/apokalyptik/cfg v0.0.0-20160401174707-703f89116901 h1:0yiOsd1b8gHxSfR/1ROHSAjKNrhoXNY3CVcPPLl/rp0= github.com/apokalyptik/cfg v0.0.0-20160401174707-703f89116901/go.mod h1:5a6I8lR9NZj4USqYDHMR/0eZgjYivY+a1syWE0NO1po= github.com/bearcherian/rollzap v1.0.2 h1:Q74bycIl4F4VruPdcc7Py5zpByKaobUGk4PwVymVmUg= @@ -200,6 +202,7 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20221002022538-bcab6841153b h1:6e93nYa3hNqAvLr0pD4PN1fFS+gKzp2zAXqrnTCstqU= diff --git a/vendor/github.com/antchfx/xmlquery/.travis.yml b/vendor/github.com/antchfx/xmlquery/.travis.yml index b99bee6a..c4d55b39 100644 --- a/vendor/github.com/antchfx/xmlquery/.travis.yml +++ b/vendor/github.com/antchfx/xmlquery/.travis.yml @@ -12,4 +12,4 @@ install: - go get github.com/golang/groupcache script: - - $HOME/gopath/bin/goveralls -service=travis-ci \ No newline at end of file + - $HOME/gopath/bin/goveralls -service=travis-ci diff --git 
a/vendor/github.com/antchfx/xmlquery/README.md b/vendor/github.com/antchfx/xmlquery/README.md index 8b3c35ee..410ae444 100644 --- a/vendor/github.com/antchfx/xmlquery/README.md +++ b/vendor/github.com/antchfx/xmlquery/README.md @@ -15,6 +15,9 @@ Overview Change Logs === +2020-08-?? +- Add XML stream loading and parsing support. + 2019-11-11 - Add XPath query caching. @@ -48,26 +51,58 @@ if err != nil { } ``` -#### Parse a XML from URL. +#### Parse an XML from URL. ```go doc, err := xmlquery.LoadURL("http://www.example.com/sitemap.xml") ``` -#### Parse a XML from string. +#### Parse an XML from string. ```go s := `` doc, err := xmlquery.Parse(strings.NewReader(s)) ``` -#### Parse a XML from io.Reader. +#### Parse an XML from io.Reader. ```go f, err := os.Open("../books.xml") doc, err := xmlquery.Parse(f) ``` +#### Parse an XML in a stream fashion (simple case without element filtering). + +```go +f, err := os.Open("../books.xml") +p, err := xmlquery.CreateStreamParser(f, "/bookstore/book") +for { + n, err := p.Read() + if err == io.EOF { + break + } + if err != nil { + ... + } +} +``` + +#### Parse an XML in a stream fashion (simple case advanced element filtering). + +```go +f, err := os.Open("../books.xml") +p, err := xmlquery.CreateStreamParser(f, "/bookstore/book", "/bookstore/book[price>=10]") +for { + n, err := p.Read() + if err == io.EOF { + break + } + if err != nil { + ... + } +} +``` + #### Find authors of all books in the bookstore. ```go @@ -210,11 +245,11 @@ func main(){ List of supported XPath query packages === -|Name |Description | -|--------------------------|----------------| -|[htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document| -|[xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document| -|[jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document| +| Name | Description | +| ------------------------------------------------- | ----------------------------------------- | +| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document | +| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document | +| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document | Questions === diff --git a/vendor/github.com/antchfx/xmlquery/node.go b/vendor/github.com/antchfx/xmlquery/node.go index c57b0ed1..e0537482 100644 --- a/vendor/github.com/antchfx/xmlquery/node.go +++ b/vendor/github.com/antchfx/xmlquery/node.go @@ -3,13 +3,8 @@ package xmlquery import ( "bytes" "encoding/xml" - "errors" "fmt" - "io" - "net/http" "strings" - - "golang.org/x/net/html/charset" ) // A NodeType is the type of a Node. @@ -146,7 +141,8 @@ func (n *Node) OutputXML(self bool) string { return buf.String() } -func addAttr(n *Node, key, val string) { +// AddAttr adds a new attribute specified by 'key' and 'val' to a node 'n'. +func AddAttr(n *Node, key, val string) { var attr xml.Attr if i := strings.Index(key, ":"); i > 0 { attr = xml.Attr{ @@ -163,10 +159,13 @@ func addAttr(n *Node, key, val string) { n.Attr = append(n.Attr, attr) } -func addChild(parent, n *Node) { +// AddChild adds a new node 'n' to a node 'parent' as its last child. 
+func AddChild(parent, n *Node) { n.Parent = parent + n.NextSibling = nil if parent.FirstChild == nil { parent.FirstChild = n + n.PrevSibling = nil } else { parent.LastChild.NextSibling = n n.PrevSibling = parent.LastChild @@ -175,153 +174,48 @@ func addChild(parent, n *Node) { parent.LastChild = n } -func addSibling(sibling, n *Node) { +// AddSibling adds a new node 'n' as a sibling of a given node 'sibling'. +// Note it is not necessarily true that the new node 'n' would be added +// immediately after 'sibling'. If 'sibling' isn't the last child of its +// parent, then the new node 'n' will be added at the end of the sibling +// chain of their parent. +func AddSibling(sibling, n *Node) { for t := sibling.NextSibling; t != nil; t = t.NextSibling { sibling = t } n.Parent = sibling.Parent sibling.NextSibling = n n.PrevSibling = sibling + n.NextSibling = nil if sibling.Parent != nil { sibling.Parent.LastChild = n } } -// LoadURL loads the XML document from the specified URL. -func LoadURL(url string) (*Node, error) { - resp, err := http.Get(url) - if err != nil { - return nil, err +// RemoveFromTree removes a node and its subtree from the document +// tree it is in. If the node is the root of the tree, then it's no-op. +func RemoveFromTree(n *Node) { + if n.Parent == nil { + return } - defer resp.Body.Close() - return parse(resp.Body) -} - -func parse(r io.Reader) (*Node, error) { - var ( - decoder = xml.NewDecoder(r) - doc = &Node{Type: DocumentNode} - space2prefix = make(map[string]string) - level = 0 - ) - // http://www.w3.org/XML/1998/namespace is bound by definition to the prefix xml. - space2prefix["http://www.w3.org/XML/1998/namespace"] = "xml" - decoder.CharsetReader = charset.NewReaderLabel - prev := doc - for { - tok, err := decoder.Token() - switch { - case err == io.EOF: - goto quit - case err != nil: - return nil, err + if n.Parent.FirstChild == n { + if n.Parent.LastChild == n { + n.Parent.FirstChild = nil + n.Parent.LastChild = nil + } else { + n.Parent.FirstChild = n.NextSibling + n.NextSibling.PrevSibling = nil } - - switch tok := tok.(type) { - case xml.StartElement: - if level == 0 { - // mising XML declaration - node := &Node{Type: DeclarationNode, Data: "xml", level: 1} - addChild(prev, node) - level = 1 - prev = node - } - // https://www.w3.org/TR/xml-names/#scoping-defaulting - for _, att := range tok.Attr { - if att.Name.Local == "xmlns" { - space2prefix[att.Value] = "" - } else if att.Name.Space == "xmlns" { - space2prefix[att.Value] = att.Name.Local - } - } - - if tok.Name.Space != "" { - if _, found := space2prefix[tok.Name.Space]; !found { - return nil, errors.New("xmlquery: invalid XML document, namespace is missing") - } - } - - for i := 0; i < len(tok.Attr); i++ { - att := &tok.Attr[i] - if prefix, ok := space2prefix[att.Name.Space]; ok { - att.Name.Space = prefix - } - } - - node := &Node{ - Type: ElementNode, - Data: tok.Name.Local, - Prefix: space2prefix[tok.Name.Space], - NamespaceURI: tok.Name.Space, - Attr: tok.Attr, - level: level, - } - //fmt.Println(fmt.Sprintf("start > %s : %d", node.Data, level)) - if level == prev.level { - addSibling(prev, node) - } else if level > prev.level { - addChild(prev, node) - } else if level < prev.level { - for i := prev.level - level; i > 1; i-- { - prev = prev.Parent - } - addSibling(prev.Parent, node) - } - prev = node - level++ - case xml.EndElement: - level-- - case xml.CharData: - node := &Node{Type: CharDataNode, Data: string(tok), level: level} - if level == prev.level { - addSibling(prev, node) - } else if level 
> prev.level { - addChild(prev, node) - } else if level < prev.level { - for i := prev.level - level; i > 1; i-- { - prev = prev.Parent - } - addSibling(prev.Parent, node) - } - case xml.Comment: - node := &Node{Type: CommentNode, Data: string(tok), level: level} - if level == prev.level { - addSibling(prev, node) - } else if level > prev.level { - addChild(prev, node) - } else if level < prev.level { - for i := prev.level - level; i > 1; i-- { - prev = prev.Parent - } - addSibling(prev.Parent, node) - } - case xml.ProcInst: // Processing Instruction - if prev.Type != DeclarationNode { - level++ - } - node := &Node{Type: DeclarationNode, Data: tok.Target, level: level} - pairs := strings.Split(string(tok.Inst), " ") - for _, pair := range pairs { - pair = strings.TrimSpace(pair) - if i := strings.Index(pair, "="); i > 0 { - addAttr(node, pair[:i], strings.Trim(pair[i+1:], `"`)) - } - } - if level == prev.level { - addSibling(prev, node) - } else if level > prev.level { - addChild(prev, node) - } - prev = node - case xml.Directive: + } else { + if n.Parent.LastChild == n { + n.Parent.LastChild = n.PrevSibling + n.PrevSibling.NextSibling = nil + } else { + n.PrevSibling.NextSibling = n.NextSibling + n.NextSibling.PrevSibling = n.PrevSibling } - } -quit: - return doc, nil -} - -// Parse returns the parse tree for the XML from the given Reader. -func Parse(r io.Reader) (*Node, error) { - return parse(r) + n.Parent = nil + n.PrevSibling = nil + n.NextSibling = nil } diff --git a/vendor/github.com/antchfx/xmlquery/parse.go b/vendor/github.com/antchfx/xmlquery/parse.go new file mode 100644 index 00000000..853ea018 --- /dev/null +++ b/vendor/github.com/antchfx/xmlquery/parse.go @@ -0,0 +1,311 @@ +package xmlquery + +import ( + "encoding/xml" + "errors" + "fmt" + "io" + "net/http" + "strings" + + "github.com/antchfx/xpath" + "golang.org/x/net/html/charset" +) + +// LoadURL loads the XML document from the specified URL. +func LoadURL(url string) (*Node, error) { + resp, err := http.Get(url) + if err != nil { + return nil, err + } + defer resp.Body.Close() + // Checking the HTTP Content-Type value from the response headers.(#39) + v := strings.ToLower(resp.Header.Get("Content-Type")) + if v == "text/xml" || v == "application/xml" { + return Parse(resp.Body) + } + return nil, fmt.Errorf("invalid XML document(%s)", v) +} + +// Parse returns the parse tree for the XML from the given Reader. +func Parse(r io.Reader) (*Node, error) { + p := createParser(r) + for { + _, err := p.parse() + if err == io.EOF { + return p.doc, nil + } + if err != nil { + return nil, err + } + } +} + +type parser struct { + decoder *xml.Decoder + doc *Node + space2prefix map[string]string + level int + prev *Node + streamElementXPath *xpath.Expr // Under streaming mode, this specifies the xpath to the target element node(s). + streamElementFilter *xpath.Expr // If specified, it provides a futher filtering on the target element. + streamNode *Node // Need to remmeber the last target node So we can clean it up upon next Read() call. + streamNodePrev *Node // Need to remember target node's prev so upon target node removal, we can restore correct prev. +} + +func createParser(r io.Reader) *parser { + p := &parser{ + decoder: xml.NewDecoder(r), + doc: &Node{Type: DocumentNode}, + space2prefix: make(map[string]string), + level: 0, + } + // http://www.w3.org/XML/1998/namespace is bound by definition to the prefix xml. 
+ p.space2prefix["http://www.w3.org/XML/1998/namespace"] = "xml" + p.decoder.CharsetReader = charset.NewReaderLabel + p.prev = p.doc + return p +} + +func (p *parser) parse() (*Node, error) { + var streamElementNodeCounter int + + for { + tok, err := p.decoder.Token() + if err != nil { + return nil, err + } + + switch tok := tok.(type) { + case xml.StartElement: + if p.level == 0 { + // mising XML declaration + node := &Node{Type: DeclarationNode, Data: "xml", level: 1} + AddChild(p.prev, node) + p.level = 1 + p.prev = node + } + // https://www.w3.org/TR/xml-names/#scoping-defaulting + for _, att := range tok.Attr { + if att.Name.Local == "xmlns" { + p.space2prefix[att.Value] = "" + } else if att.Name.Space == "xmlns" { + p.space2prefix[att.Value] = att.Name.Local + } + } + + if tok.Name.Space != "" { + if _, found := p.space2prefix[tok.Name.Space]; !found { + return nil, errors.New("xmlquery: invalid XML document, namespace is missing") + } + } + + for i := 0; i < len(tok.Attr); i++ { + att := &tok.Attr[i] + if prefix, ok := p.space2prefix[att.Name.Space]; ok { + att.Name.Space = prefix + } + } + + node := &Node{ + Type: ElementNode, + Data: tok.Name.Local, + Prefix: p.space2prefix[tok.Name.Space], + NamespaceURI: tok.Name.Space, + Attr: tok.Attr, + level: p.level, + } + //fmt.Println(fmt.Sprintf("start > %s : %d", node.Data, node.level)) + if p.level == p.prev.level { + AddSibling(p.prev, node) + } else if p.level > p.prev.level { + AddChild(p.prev, node) + } else if p.level < p.prev.level { + for i := p.prev.level - p.level; i > 1; i-- { + p.prev = p.prev.Parent + } + AddSibling(p.prev.Parent, node) + } + // If we're in the streaming mode, we need to remember the node if it is the target node + // so that when we finish processing the node's EndElement, we know how/what to return to + // caller. Also we need to remove the target node from the tree upon next Read() call so + // memory doesn't grow unbounded. + if p.streamElementXPath != nil { + if p.streamNode == nil { + if QuerySelector(p.doc, p.streamElementXPath) != nil { + p.streamNode = node + p.streamNodePrev = p.prev + streamElementNodeCounter = 1 + } + } else { + streamElementNodeCounter++ + } + } + p.prev = node + p.level++ + case xml.EndElement: + p.level-- + // If we're in streaming mode, and we already have a potential streaming + // target node identified (p.streamNode != nil) then we need to check if + // this is the real one we want to return to caller. + if p.streamNode != nil { + streamElementNodeCounter-- + if streamElementNodeCounter == 0 { + // Now we know this element node is the at least passing the initial + // p.streamElementXPath check and is a potential target node candidate. + // We need to have 1 more check with p.streamElementFilter (if given) to + // ensure it is really the element node we want. + // The reason we need a two-step check process is because the following + // situation: + // b1 + // And say the p.streamElementXPath = "/AAA/BBB[. != 'b1']". Now during + // xml.StartElement time, the node is still empty, so it will pass + // the p.streamElementXPath check. However, eventually we know this + // shouldn't be returned to the caller. Having a second more fine-grained + // filter check ensures that. So in this case, the caller should really + // setup the stream parser with: + // streamElementXPath = "/AAA/BBB[" + // streamElementFilter = "/AAA/BBB[. 
!= 'b1']" + if p.streamElementFilter == nil || QuerySelector(p.doc, p.streamElementFilter) != nil { + return p.streamNode, nil + } + // otherwise, this isn't our target node, clean things up. + // note we also remove the underlying *Node from the node tree, to prevent + // future stream node candidate selection error. + RemoveFromTree(p.streamNode) + p.prev = p.streamNodePrev + p.streamNode = nil + p.streamNodePrev = nil + } + } + case xml.CharData: + node := &Node{Type: CharDataNode, Data: string(tok), level: p.level} + if p.level == p.prev.level { + AddSibling(p.prev, node) + } else if p.level > p.prev.level { + AddChild(p.prev, node) + } else if p.level < p.prev.level { + for i := p.prev.level - p.level; i > 1; i-- { + p.prev = p.prev.Parent + } + AddSibling(p.prev.Parent, node) + } + case xml.Comment: + node := &Node{Type: CommentNode, Data: string(tok), level: p.level} + if p.level == p.prev.level { + AddSibling(p.prev, node) + } else if p.level > p.prev.level { + AddChild(p.prev, node) + } else if p.level < p.prev.level { + for i := p.prev.level - p.level; i > 1; i-- { + p.prev = p.prev.Parent + } + AddSibling(p.prev.Parent, node) + } + case xml.ProcInst: // Processing Instruction + if p.prev.Type != DeclarationNode { + p.level++ + } + node := &Node{Type: DeclarationNode, Data: tok.Target, level: p.level} + pairs := strings.Split(string(tok.Inst), " ") + for _, pair := range pairs { + pair = strings.TrimSpace(pair) + if i := strings.Index(pair, "="); i > 0 { + AddAttr(node, pair[:i], strings.Trim(pair[i+1:], `"`)) + } + } + if p.level == p.prev.level { + AddSibling(p.prev, node) + } else if p.level > p.prev.level { + AddChild(p.prev, node) + } + p.prev = node + case xml.Directive: + } + } +} + +// StreamParser enables loading and parsing an XML document in a streaming fashion. +type StreamParser struct { + p *parser +} + +// CreateStreamParser creates a StreamParser. Argument streamElementXPath is required. +// Argument streamElementFilter is optional and should only be used in advanced scenarios. +// +// Scenario 1: simple case: +// xml := `b1b2` +// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB") +// if err != nil { +// panic(err) +// } +// for { +// n, err := sp.Read() +// if err != nil { +// break +// } +// fmt.Println(n.OutputXML(true)) +// } +// Output will be: +// b1 +// b2 +// +// Scenario 2: advanced case: +// xml := `b1b2` +// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. != 'b1']") +// if err != nil { +// panic(err) +// } +// for { +// n, err := sp.Read() +// if err != nil { +// break +// } +// fmt.Println(n.OutputXML(true)) +// } +// Output will be: +// b2 +// +// As the argument names indicate, streamElementXPath should be used for providing xpath query pointing +// to the target element node only, no extra filtering on the element itself or its children; while +// streamElementFilter, if needed, can provide additional filtering on the target element and its children. +// +// CreateStreamParser returns error if either streamElementXPath or streamElementFilter, if provided, cannot +// be successfully parsed and compiled into a valid xpath query. 
+func CreateStreamParser(r io.Reader, streamElementXPath string, streamElementFilter ...string) (*StreamParser, error) { + elemXPath, err := getQuery(streamElementXPath) + if err != nil { + return nil, fmt.Errorf("invalid streamElementXPath '%s', err: %s", streamElementXPath, err.Error()) + } + elemFilter := (*xpath.Expr)(nil) + if len(streamElementFilter) > 0 { + elemFilter, err = getQuery(streamElementFilter[0]) + if err != nil { + return nil, fmt.Errorf("invalid streamElementFilter '%s', err: %s", streamElementFilter[0], err.Error()) + } + } + sp := &StreamParser{ + p: createParser(r), + } + sp.p.streamElementXPath = elemXPath + sp.p.streamElementFilter = elemFilter + return sp, nil +} + +// Read returns a target node that satisifies the XPath specified by caller at StreamParser creation +// time. If there is no more satisifying target node after reading the rest of the XML document, io.EOF +// will be returned. At any time, any XML parsing error encountered, the error will be returned and +// the stream parsing is stopped. Calling Read() after an error is returned (including io.EOF) is not +// allowed the behavior will be undefined. Also note, due to the streaming nature, calling Read() will +// automatically remove any previous target node(s) from the document tree. +func (sp *StreamParser) Read() (*Node, error) { + // Because this is a streaming read, we need to release/remove last + // target node from the node tree to free up memory. + if sp.p.streamNode != nil { + RemoveFromTree(sp.p.streamNode) + sp.p.prev = sp.p.streamNodePrev + sp.p.streamNode = nil + sp.p.streamNodePrev = nil + } + return sp.p.parse() +} diff --git a/vendor/github.com/antchfx/xpath/func.go b/vendor/github.com/antchfx/xpath/func.go index 3873e33f..bcfee55b 100644 --- a/vendor/github.com/antchfx/xpath/func.go +++ b/vendor/github.com/antchfx/xpath/func.go @@ -4,11 +4,26 @@ import ( "errors" "fmt" "math" - "regexp" "strconv" "strings" + "sync" + "unicode" ) +// Defined an interface of stringBuilder that compatible with +// strings.Builder(go 1.10) and bytes.Buffer(< go 1.10) +type stringBuilder interface { + WriteRune(r rune) (n int, err error) + WriteString(s string) (int, error) + Reset() + Grow(n int) + String() string +} + +var builderPool = sync.Pool{New: func() interface{} { + return newStringBuilder() +}} + // The XPath function list. func predicate(q query) func(NodeNavigator) bool { @@ -58,6 +73,7 @@ func lastFunc(q query, t iterator) interface{} { // countFunc is a XPath Node Set functions count(node-set). func countFunc(q query, t iterator) interface{} { var count = 0 + q = functionArgs(q) test := predicate(q) switch typ := q.Evaluate(t).(type) { case query: @@ -73,7 +89,7 @@ func countFunc(q query, t iterator) interface{} { // sumFunc is a XPath Node Set functions sum(node-set). func sumFunc(q query, t iterator) interface{} { var sum float64 - switch typ := q.Evaluate(t).(type) { + switch typ := functionArgs(q).Evaluate(t).(type) { case query: for node := typ.Select(t); node != nil; node = typ.Select(t) { if v, err := strconv.ParseFloat(node.Value(), 64); err == nil { @@ -116,19 +132,19 @@ func asNumber(t iterator, o interface{}) float64 { // ceilingFunc is a XPath Node Set functions ceiling(node-set). func ceilingFunc(q query, t iterator) interface{} { - val := asNumber(t, q.Evaluate(t)) + val := asNumber(t, functionArgs(q).Evaluate(t)) return math.Ceil(val) } // floorFunc is a XPath Node Set functions floor(node-set). 
func floorFunc(q query, t iterator) interface{} { - val := asNumber(t, q.Evaluate(t)) + val := asNumber(t, functionArgs(q).Evaluate(t)) return math.Floor(val) } // roundFunc is a XPath Node Set functions round(node-set). func roundFunc(q query, t iterator) interface{} { - val := asNumber(t, q.Evaluate(t)) + val := asNumber(t, functionArgs(q).Evaluate(t)) //return math.Round(val) return round(val) } @@ -201,7 +217,7 @@ func asBool(t iterator, v interface{}) bool { case *NodeIterator: return v.MoveNext() case bool: - return bool(v) + return v case float64: return v != 0 case string: @@ -239,19 +255,19 @@ func asString(t iterator, v interface{}) string { // booleanFunc is a XPath functions boolean([node-set]). func booleanFunc(q query, t iterator) interface{} { - v := q.Evaluate(t) + v := functionArgs(q).Evaluate(t) return asBool(t, v) } // numberFunc is a XPath functions number([node-set]). func numberFunc(q query, t iterator) interface{} { - v := q.Evaluate(t) + v := functionArgs(q).Evaluate(t) return asNumber(t, v) } // stringFunc is a XPath functions string([node-set]). func stringFunc(q query, t iterator) interface{} { - v := q.Evaluate(t) + v := functionArgs(q).Evaluate(t) return asString(t, v) } @@ -338,15 +354,10 @@ func containsFunc(arg1, arg2 query) func(query, iterator) interface{} { } } -var ( - regnewline = regexp.MustCompile(`[\r\n\t]`) - regseqspace = regexp.MustCompile(`\s{2,}`) -) - // normalizespaceFunc is XPath functions normalize-space(string?) func normalizespaceFunc(q query, t iterator) interface{} { var m string - switch typ := q.Evaluate(t).(type) { + switch typ := functionArgs(q).Evaluate(t).(type) { case string: m = typ case query: @@ -356,10 +367,26 @@ func normalizespaceFunc(q query, t iterator) interface{} { } m = node.Value() } - m = strings.TrimSpace(m) - m = regnewline.ReplaceAllString(m, " ") - m = regseqspace.ReplaceAllString(m, " ") - return m + var b = builderPool.Get().(stringBuilder) + b.Grow(len(m)) + + runeStr := []rune(strings.TrimSpace(m)) + l := len(runeStr) + for i := range runeStr { + r := runeStr[i] + isSpace := unicode.IsSpace(r) + if !(isSpace && (i+1 < l && unicode.IsSpace(runeStr[i+1]))) { + if isSpace { + r = ' ' + } + b.WriteRune(r) + } + } + result := b.String() + b.Reset() + builderPool.Put(b) + + return result } // substringFunc is XPath functions substring function returns a part of a given string. @@ -466,7 +493,7 @@ func translateFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} { src := asString(t, functionArgs(arg2).Evaluate(t)) dst := asString(t, functionArgs(arg3).Evaluate(t)) - var replace []string + replace := make([]string, 0, len(src)) for i, s := range src { d := "" if i < len(dst) { @@ -491,7 +518,7 @@ func replaceFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} { // notFunc is XPATH functions not(expression) function operation. 
func notFunc(q query, t iterator) interface{} { - switch v := q.Evaluate(t).(type) { + switch v := functionArgs(q).Evaluate(t).(type) { case bool: return !v case query: @@ -507,20 +534,25 @@ func notFunc(q query, t iterator) interface{} { // concat( string1 , string2 [, stringn]* ) func concatFunc(args ...query) func(query, iterator) interface{} { return func(q query, t iterator) interface{} { - var a []string + b := builderPool.Get().(stringBuilder) for _, v := range args { v = functionArgs(v) + switch v := v.Evaluate(t).(type) { case string: - a = append(a, v) + b.WriteString(v) case query: node := v.Select(t) if node != nil { - a = append(a, node.Value()) + b.WriteString(node.Value()) } } } - return strings.Join(a, "") + result := b.String() + b.Reset() + builderPool.Put(b) + + return result } } diff --git a/vendor/github.com/antchfx/xpath/func_go110.go b/vendor/github.com/antchfx/xpath/func_go110.go index 500880fa..6df30d3d 100644 --- a/vendor/github.com/antchfx/xpath/func_go110.go +++ b/vendor/github.com/antchfx/xpath/func_go110.go @@ -2,8 +2,15 @@ package xpath -import "math" +import ( + "math" + "strings" +) func round(f float64) int { return int(math.Round(f)) } + +func newStringBuilder() stringBuilder{ + return &strings.Builder{} +} diff --git a/vendor/github.com/antchfx/xpath/func_pre_go110.go b/vendor/github.com/antchfx/xpath/func_pre_go110.go index 043616b3..335141f7 100644 --- a/vendor/github.com/antchfx/xpath/func_pre_go110.go +++ b/vendor/github.com/antchfx/xpath/func_pre_go110.go @@ -2,7 +2,10 @@ package xpath -import "math" +import ( + "bytes" + "math" +) // math.Round() is supported by Go 1.10+, // This method just compatible for version <1.10. @@ -13,3 +16,7 @@ func round(f float64) int { } return int(f + math.Copysign(0.5, f)) } + +func newStringBuilder() stringBuilder { + return &bytes.Buffer{} +} diff --git a/vendor/github.com/antchfx/xpath/operator.go b/vendor/github.com/antchfx/xpath/operator.go index f9c10bcd..8c2f31f8 100644 --- a/vendor/github.com/antchfx/xpath/operator.go +++ b/vendor/github.com/antchfx/xpath/operator.go @@ -173,7 +173,7 @@ func cmpNodeSetNodeSet(t iterator, op string, m, n interface{}) bool { if y == nil { return false } - return cmpStringStringF(op,x.Value(),y.Value()) + return cmpStringStringF(op, x.Value(), y.Value()) } func cmpStringNumeric(t iterator, op string, m, n interface{}) bool { diff --git a/vendor/modules.txt b/vendor/modules.txt index 90a2b316..062eea6f 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -10,10 +10,10 @@ github.com/andybalholm/cascadia # github.com/antchfx/htmlquery v1.2.3 ## explicit; go 1.14 github.com/antchfx/htmlquery -# github.com/antchfx/xmlquery v1.2.4 +# github.com/antchfx/xmlquery v1.3.1 ## explicit; go 1.14 github.com/antchfx/xmlquery -# github.com/antchfx/xpath v1.1.8 +# github.com/antchfx/xpath v1.1.10 ## explicit github.com/antchfx/xpath # github.com/apokalyptik/cfg v0.0.0-20160401174707-703f89116901
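Editorial note (not part of the diff): xmlquery v1.3.1 exports the node-tree helpers that were package-private in v1.2.4 (`AddAttr`, `AddChild`, `AddSibling`) and adds `RemoveFromTree`, per the node.go hunk above. Below is a minimal sketch of calling them directly; the element names and the printed output in the comments are illustrative assumptions, not taken from this PR.

```go
package main

import (
	"fmt"

	"github.com/antchfx/xmlquery"
)

func main() {
	// Build a small document by hand with the newly exported helpers.
	doc := &xmlquery.Node{Type: xmlquery.DocumentNode}
	root := &xmlquery.Node{Type: xmlquery.ElementNode, Data: "bookstore"}
	xmlquery.AddChild(doc, root)

	book := &xmlquery.Node{Type: xmlquery.ElementNode, Data: "book"}
	xmlquery.AddAttr(book, "category", "web")
	xmlquery.AddChild(root, book)

	// Per the new doc comment, AddSibling appends at the end of the sibling
	// chain, so book2 becomes the last child of <bookstore>.
	book2 := &xmlquery.Node{Type: xmlquery.ElementNode, Data: "book"}
	xmlquery.AddSibling(book, book2)
	fmt.Println(doc.OutputXML(false)) // e.g. <bookstore><book category="web"></book><book></book></bookstore>

	// RemoveFromTree detaches a node and its subtree from the document.
	xmlquery.RemoveFromTree(book)
	fmt.Println(doc.OutputXML(false)) // e.g. <bookstore><book></book></bookstore>
}
```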
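The headline feature of this bump is the streaming parser added in xmlquery v1.3.1 (new parse.go above). A usage sketch based on the README scenarios included in this diff; the sample XML and the filter expression are illustrative:

```go
package main

import (
	"fmt"
	"io"
	"strings"

	"github.com/antchfx/xmlquery"
)

func main() {
	xml := `<bookstore><book><price>8</price></book><book><price>12</price></book></bookstore>`

	// Stream /bookstore/book elements, returning only those with price >= 10.
	sp, err := xmlquery.CreateStreamParser(strings.NewReader(xml),
		"/bookstore/book",
		"/bookstore/book[price>=10]")
	if err != nil {
		panic(err)
	}
	for {
		n, err := sp.Read()
		if err == io.EOF {
			break // no more matching elements
		}
		if err != nil {
			panic(err)
		}
		// Each Read() drops the previously returned subtree from the
		// in-memory tree, keeping memory bounded on large documents.
		fmt.Println(n.OutputXML(true)) // should print <book><price>12</price></book>
	}
}
```

Per the new doc comments, Read() must not be called again after it returns an error (including io.EOF), and previously returned nodes are removed from the tree on the next Read().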
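xpath v1.1.10 also rewrites normalize-space() and concat() to use a pooled string builder (strings.Builder on Go 1.10+, bytes.Buffer before) instead of regexes and string joins; the observable behavior should be unchanged. A small sketch, with illustrative XML, of normalize-space collapsing whitespace inside a predicate:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/antchfx/xmlquery"
)

func main() {
	doc, err := xmlquery.Parse(strings.NewReader(
		`<books><book><title>  Go   in   Action </title></book></books>`))
	if err != nil {
		panic(err)
	}
	// normalize-space trims the value and collapses runs of whitespace,
	// so the padded title still matches the clean string below.
	n := xmlquery.FindOne(doc, `//book[normalize-space(title)='Go in Action']`)
	fmt.Println(n != nil) // true
}
```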