diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 8611df9..0e3158b 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -55,6 +55,10 @@ "ImportPath": "github.com/gorilla/websocket", "Rev": "b6ab76f1fe9803ee1d59e7e5b2a797c1fe897ce5" }, + { + "ImportPath": "github.com/kennygrant/sanitize", + "Rev": "bf9c39a678d8e26aeee60d5fe733cad47a7a6871" + }, { "ImportPath": "github.com/qiniu/log", "Comment": "v1.0.00-2-ge002bc2", @@ -64,6 +68,14 @@ "ImportPath": "github.com/urfave/cli", "Comment": "v1.18.0-47-g168c954", "Rev": "168c95418e66e019fe17b8f4f5c45aa62ff80e23" + }, + { + "ImportPath": "golang.org/x/net/html", + "Rev": "075e191f18186a8ff2becaf64478e30f4545cdad" + }, + { + "ImportPath": "golang.org/x/net/html/atom", + "Rev": "075e191f18186a8ff2becaf64478e30f4545cdad" } ] } diff --git a/vendor/github.com/kennygrant/sanitize/.gitignore b/vendor/github.com/kennygrant/sanitize/.gitignore new file mode 100644 index 0000000..0026861 --- /dev/null +++ b/vendor/github.com/kennygrant/sanitize/.gitignore @@ -0,0 +1,22 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe diff --git a/vendor/github.com/kennygrant/sanitize/.travis.yml b/vendor/github.com/kennygrant/sanitize/.travis.yml new file mode 100644 index 0000000..4f2ee4d --- /dev/null +++ b/vendor/github.com/kennygrant/sanitize/.travis.yml @@ -0,0 +1 @@ +language: go diff --git a/vendor/github.com/kennygrant/sanitize/License-BSD.txt b/vendor/github.com/kennygrant/sanitize/License-BSD.txt new file mode 100644 index 0000000..3a7e30b --- /dev/null +++ b/vendor/github.com/kennygrant/sanitize/License-BSD.txt @@ -0,0 +1,25 @@ +License: BSD License +Copyright (c) 2013 Kenny Grant. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + * to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/vendor/github.com/kennygrant/sanitize/README.md b/vendor/github.com/kennygrant/sanitize/README.md new file mode 100644 index 0000000..7833337 --- /dev/null +++ b/vendor/github.com/kennygrant/sanitize/README.md @@ -0,0 +1,44 @@ +sanitize +======== + +Package sanitize provides functions to sanitize html and paths with go (golang). + +FUNCTIONS + + +```go +sanitize.Accents(s string) string +``` + +Accents replaces a set of accented characters with ascii equivalents. 
+ +```go +sanitize.BaseName(s string) string +``` + +BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -. Unlike Name no attempt is made to normalise text as a path. + +```go +sanitize.HTML(s string) string +``` + +Strip html tags with a very simple parser, replace common entities, and escape < and > in the result. The result is intended to be used as plain text. + +```go +sanitize.HTMLAllowing(s string, args...[]string) (string, error) +``` + +Parse html and allow certain tags and attributes from the lists optionally specified by args - args[0] is a list of allowed tags, args[1] is a list of allowed attributes. If either is missing default sets are used. + +```go +sanitize.Name(s string) string +``` + +Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters. + +```go +sanitize.Path(s string) string +``` + +Path makes a string safe to use as an url path. + diff --git a/vendor/github.com/kennygrant/sanitize/sanitize.go b/vendor/github.com/kennygrant/sanitize/sanitize.go new file mode 100755 index 0000000..18a248b --- /dev/null +++ b/vendor/github.com/kennygrant/sanitize/sanitize.go @@ -0,0 +1,384 @@ +// Package sanitize provides functions for sanitizing text. +package sanitize + +import ( + "bytes" + "html" + "html/template" + "io" + "path" + "regexp" + "strings" + + parser "golang.org/x/net/html" +) + +var ( + ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"} + + defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img"} + + defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"} +) + +// HTMLAllowing sanitizes html, allowing some tags. 
+// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments;
+// when one is omitted, defaultTags or defaultAttributes is used in its place.
+//
+// Tags that are not in the allowed list are dropped while their text content is kept.
+// A start tag that is not emitted and is listed in ignoreTags switches on suppression of
+// all output; suppression lasts until an end or self-closing tag matching the most recently
+// recorded ignored name is seen. Comments and doctypes are always discarded.
+// A tokenizer error other than io.EOF is returned together with an empty string.
+func HTMLAllowing(s string, args ...[]string) (string, error) {
+	// Start from the package defaults; args[0] overrides the tags and args[1] the attributes.
+	tags, attrs := defaultTags, defaultAttributes
+	if len(args) > 0 {
+		tags = args[0]
+		if len(args) > 1 {
+			attrs = args[1]
+		}
+	}
+
+	var out bytes.Buffer
+	// skipping holds the name of the ignored tag whose contents are being dropped ("" when not dropping).
+	skipping := ""
+	z := parser.NewTokenizer(strings.NewReader(s))
+
+	for {
+		kind := z.Next()
+		tok := z.Token()
+
+		switch kind {
+		case parser.ErrorToken:
+			// io.EOF marks the normal end of input; anything else is a real failure.
+			if err := z.Err(); err != io.EOF {
+				return "", err
+			}
+			return out.String(), nil
+
+		case parser.TextToken:
+			// Text passes through unless it sits inside an ignored tag.
+			if skipping == "" {
+				out.WriteString(tok.String())
+			}
+
+		case parser.StartTagToken, parser.SelfClosingTagToken, parser.EndTagToken:
+			if skipping == "" && includes(tags, tok.Data) {
+				if kind == parser.EndTagToken {
+					// End tags are written without attributes.
+					tok.Attr = []parser.Attribute{}
+				} else {
+					tok.Attr = cleanAttributes(tok.Attr, attrs)
+				}
+				out.WriteString(tok.String())
+				continue
+			}
+			if kind == parser.StartTagToken {
+				// Entering an ignored tag: remember it so its contents are dropped.
+				if includes(ignoreTags, tok.Data) {
+					skipping = tok.Data
+				}
+			} else if tok.Data == skipping {
+				// The ignored tag has been closed; resume output.
+				skipping = ""
+			}
+
+		default:
+			// Comments, doctypes and unknown token types are discarded.
+		}
+	}
+}
+
+// HTML strips html tags, replace common entities, and escapes <>&;'" in the
result. +// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated. +func HTML(s string) string { + + output := "" + + // Shortcut strings with no tags in them + if !strings.ContainsAny(s, "<>") { + output = s + } else { + + // First remove line breaks etc as these have no meaning outside html tags (except pre) + // this means pre sections will lose formatting... but will result in less uninentional paras. + s = strings.Replace(s, "\n", "", -1) + + // Then replace line breaks with newlines, to preserve that formatting + s = strings.Replace(s, "
", "\n", -1) + s = strings.Replace(s, "block. + if d != "" && d[0] == '\r' { + d = d[1:] + } + if d != "" && d[0] == '\n' { + d = d[1:] + } + } + } + d = strings.Replace(d, "\x00", "", -1) + if d == "" { + return true + } + p.reconstructActiveFormattingElements() + p.addText(d) + if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { + // There were non-whitespace characters inserted. + p.framesetOK = false + } + case StartTagToken: + switch p.tok.DataAtom { + case a.Html: + copyAttributes(p.oe[0], p.tok) + case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title: + return inHeadIM(p) + case a.Body: + if len(p.oe) >= 2 { + body := p.oe[1] + if body.Type == ElementNode && body.DataAtom == a.Body { + p.framesetOK = false + copyAttributes(body, p.tok) + } + } + case a.Frameset: + if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { + // Ignore the token. + return true + } + body := p.oe[1] + if body.Parent != nil { + body.Parent.RemoveChild(body) + } + p.oe = p.oe[:1] + p.addElement() + p.im = inFramesetIM + return true + case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: + p.popUntil(buttonScope, a.P) + p.addElement() + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.popUntil(buttonScope, a.P) + switch n := p.top(); n.DataAtom { + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.oe.pop() + } + p.addElement() + case a.Pre, a.Listing: + p.popUntil(buttonScope, a.P) + p.addElement() + // The newline, if any, will be dealt with by the TextToken case. 
+ p.framesetOK = false + case a.Form: + if p.form == nil { + p.popUntil(buttonScope, a.P) + p.addElement() + p.form = p.top() + } + case a.Li: + p.framesetOK = false + for i := len(p.oe) - 1; i >= 0; i-- { + node := p.oe[i] + switch node.DataAtom { + case a.Li: + p.oe = p.oe[:i] + case a.Address, a.Div, a.P: + continue + default: + if !isSpecialElement(node) { + continue + } + } + break + } + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Dd, a.Dt: + p.framesetOK = false + for i := len(p.oe) - 1; i >= 0; i-- { + node := p.oe[i] + switch node.DataAtom { + case a.Dd, a.Dt: + p.oe = p.oe[:i] + case a.Address, a.Div, a.P: + continue + default: + if !isSpecialElement(node) { + continue + } + } + break + } + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Plaintext: + p.popUntil(buttonScope, a.P) + p.addElement() + case a.Button: + p.popUntil(defaultScope, a.Button) + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + case a.A: + for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { + if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { + p.inBodyEndTagFormatting(a.A) + p.oe.remove(n) + p.afe.remove(n) + break + } + } + p.reconstructActiveFormattingElements() + p.addFormattingElement() + case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: + p.reconstructActiveFormattingElements() + p.addFormattingElement() + case a.Nobr: + p.reconstructActiveFormattingElements() + if p.elementInScope(defaultScope, a.Nobr) { + p.inBodyEndTagFormatting(a.Nobr) + p.reconstructActiveFormattingElements() + } + p.addFormattingElement() + case a.Applet, a.Marquee, a.Object: + p.reconstructActiveFormattingElements() + p.addElement() + p.afe = append(p.afe, &scopeMarker) + p.framesetOK = false + case a.Table: + if !p.quirks { + p.popUntil(buttonScope, a.P) + } + p.addElement() + p.framesetOK = false + p.im = inTableIM + return true + case a.Area, a.Br, a.Embed, a.Img, a.Input, 
a.Keygen, a.Wbr: + p.reconstructActiveFormattingElements() + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + if p.tok.DataAtom == a.Input { + for _, t := range p.tok.Attr { + if t.Key == "type" { + if strings.ToLower(t.Val) == "hidden" { + // Skip setting framesetOK = false + return true + } + } + } + } + p.framesetOK = false + case a.Param, a.Source, a.Track: + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + case a.Hr: + p.popUntil(buttonScope, a.P) + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + p.framesetOK = false + case a.Image: + p.tok.DataAtom = a.Img + p.tok.Data = a.Img.String() + return false + case a.Isindex: + if p.form != nil { + // Ignore the token. + return true + } + action := "" + prompt := "This is a searchable index. Enter search keywords: " + attr := []Attribute{{Key: "name", Val: "isindex"}} + for _, t := range p.tok.Attr { + switch t.Key { + case "action": + action = t.Val + case "name": + // Ignore the attribute. + case "prompt": + prompt = t.Val + default: + attr = append(attr, t) + } + } + p.acknowledgeSelfClosingTag() + p.popUntil(buttonScope, a.P) + p.parseImpliedToken(StartTagToken, a.Form, a.Form.String()) + if action != "" { + p.form.Attr = []Attribute{{Key: "action", Val: action}} + } + p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) + p.parseImpliedToken(StartTagToken, a.Label, a.Label.String()) + p.addText(prompt) + p.addChild(&Node{ + Type: ElementNode, + DataAtom: a.Input, + Data: a.Input.String(), + Attr: attr, + }) + p.oe.pop() + p.parseImpliedToken(EndTagToken, a.Label, a.Label.String()) + p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) + p.parseImpliedToken(EndTagToken, a.Form, a.Form.String()) + case a.Textarea: + p.addElement() + p.setOriginalIM() + p.framesetOK = false + p.im = textIM + case a.Xmp: + p.popUntil(buttonScope, a.P) + p.reconstructActiveFormattingElements() + p.framesetOK = false + p.addElement() + p.setOriginalIM() + p.im = textIM + case a.Iframe: + 
p.framesetOK = false + p.addElement() + p.setOriginalIM() + p.im = textIM + case a.Noembed, a.Noscript: + p.addElement() + p.setOriginalIM() + p.im = textIM + case a.Select: + p.reconstructActiveFormattingElements() + p.addElement() + p.framesetOK = false + p.im = inSelectIM + return true + case a.Optgroup, a.Option: + if p.top().DataAtom == a.Option { + p.oe.pop() + } + p.reconstructActiveFormattingElements() + p.addElement() + case a.Rp, a.Rt: + if p.elementInScope(defaultScope, a.Ruby) { + p.generateImpliedEndTags() + } + p.addElement() + case a.Math, a.Svg: + p.reconstructActiveFormattingElements() + if p.tok.DataAtom == a.Math { + adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) + } else { + adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) + } + adjustForeignAttributes(p.tok.Attr) + p.addElement() + p.top().Namespace = p.tok.Data + if p.hasSelfClosingToken { + p.oe.pop() + p.acknowledgeSelfClosingTag() + } + return true + case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: + // Ignore the token. + default: + p.reconstructActiveFormattingElements() + p.addElement() + } + case EndTagToken: + switch p.tok.DataAtom { + case a.Body: + if p.elementInScope(defaultScope, a.Body) { + p.im = afterBodyIM + } + case a.Html: + if p.elementInScope(defaultScope, a.Body) { + p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) + return false + } + return true + case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: + p.popUntil(defaultScope, p.tok.DataAtom) + case a.Form: + node := p.form + p.form = nil + i := p.indexOfElementInScope(defaultScope, a.Form) + if node == nil || i == -1 || p.oe[i] != node { + // Ignore the token. 
+ return true + } + p.generateImpliedEndTags() + p.oe.remove(node) + case a.P: + if !p.elementInScope(buttonScope, a.P) { + p.parseImpliedToken(StartTagToken, a.P, a.P.String()) + } + p.popUntil(buttonScope, a.P) + case a.Li: + p.popUntil(listItemScope, a.Li) + case a.Dd, a.Dt: + p.popUntil(defaultScope, p.tok.DataAtom) + case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: + p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) + case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: + p.inBodyEndTagFormatting(p.tok.DataAtom) + case a.Applet, a.Marquee, a.Object: + if p.popUntil(defaultScope, p.tok.DataAtom) { + p.clearActiveFormattingElements() + } + case a.Br: + p.tok.Type = StartTagToken + return false + default: + p.inBodyEndTagOther(p.tok.DataAtom) + } + case CommentToken: + p.addChild(&Node{ + Type: CommentNode, + Data: p.tok.Data, + }) + } + + return true +} + +func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) { + // This is the "adoption agency" algorithm, described at + // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency + + // TODO: this is a fairly literal line-by-line translation of that algorithm. + // Once the code successfully parses the comprehensive test suite, we should + // refactor this code to be more idiomatic. + + // Steps 1-4. The outer loop. + for i := 0; i < 8; i++ { + // Step 5. Find the formatting element. + var formattingElement *Node + for j := len(p.afe) - 1; j >= 0; j-- { + if p.afe[j].Type == scopeMarkerNode { + break + } + if p.afe[j].DataAtom == tagAtom { + formattingElement = p.afe[j] + break + } + } + if formattingElement == nil { + p.inBodyEndTagOther(tagAtom) + return + } + feIndex := p.oe.index(formattingElement) + if feIndex == -1 { + p.afe.remove(formattingElement) + return + } + if !p.elementInScope(defaultScope, tagAtom) { + // Ignore the tag. + return + } + + // Steps 9-10. Find the furthest block. 
+ var furthestBlock *Node + for _, e := range p.oe[feIndex:] { + if isSpecialElement(e) { + furthestBlock = e + break + } + } + if furthestBlock == nil { + e := p.oe.pop() + for e != formattingElement { + e = p.oe.pop() + } + p.afe.remove(e) + return + } + + // Steps 11-12. Find the common ancestor and bookmark node. + commonAncestor := p.oe[feIndex-1] + bookmark := p.afe.index(formattingElement) + + // Step 13. The inner loop. Find the lastNode to reparent. + lastNode := furthestBlock + node := furthestBlock + x := p.oe.index(node) + // Steps 13.1-13.2 + for j := 0; j < 3; j++ { + // Step 13.3. + x-- + node = p.oe[x] + // Step 13.4 - 13.5. + if p.afe.index(node) == -1 { + p.oe.remove(node) + continue + } + // Step 13.6. + if node == formattingElement { + break + } + // Step 13.7. + clone := node.clone() + p.afe[p.afe.index(node)] = clone + p.oe[p.oe.index(node)] = clone + node = clone + // Step 13.8. + if lastNode == furthestBlock { + bookmark = p.afe.index(node) + 1 + } + // Step 13.9. + if lastNode.Parent != nil { + lastNode.Parent.RemoveChild(lastNode) + } + node.AppendChild(lastNode) + // Step 13.10. + lastNode = node + } + + // Step 14. Reparent lastNode to the common ancestor, + // or for misnested table nodes, to the foster parent. + if lastNode.Parent != nil { + lastNode.Parent.RemoveChild(lastNode) + } + switch commonAncestor.DataAtom { + case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: + p.fosterParent(lastNode) + default: + commonAncestor.AppendChild(lastNode) + } + + // Steps 15-17. Reparent nodes from the furthest block's children + // to a clone of the formatting element. + clone := formattingElement.clone() + reparentChildren(clone, furthestBlock) + furthestBlock.AppendChild(clone) + + // Step 18. Fix up the list of active formatting elements. + if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { + // Move the bookmark with the rest of the list. 
+			bookmark--
+		}
+		p.afe.remove(formattingElement)
+		p.afe.insert(bookmark, clone)
+
+		// Step 19. Fix up the stack of open elements.
+		p.oe.remove(formattingElement)
+		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
+	}
+}
+
+// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM;
+// see "Any other end tag" in 12.2.5.5 The rules for parsing tokens in foreign content,
+// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
+//
+// It walks the stack of open elements (p.oe) from the top down. On reaching an
+// element whose DataAtom equals tagAtom, that element and everything above it
+// are removed from the stack. The walk stops early, leaving the stack untouched,
+// if an element reported as special by isSpecialElement is met first.
+func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
+	for depth := len(p.oe); depth > 0; depth-- {
+		node := p.oe[depth-1]
+		if node.DataAtom == tagAtom {
+			// Truncate the stack so the matching element and all above it go away.
+			p.oe = p.oe[:depth-1]
+			return
+		}
+		if isSpecialElement(node) {
+			return
+		}
+	}
+}
+
+// Section 12.2.5.4.8.
+func textIM(p *parser) bool {
+	switch p.tok.Type {
+	case ErrorToken:
+		p.oe.pop()
+	case TextToken:
+		d := p.tok.Data
+		if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
+			// Ignore a newline at the start of a