Rewrite the HTML tag scanner: replace the regex-based HtmlScan with a parse5 SAX parser

Guillaume Chau · Guillaume Chau · commit 00700b280184 · 2017-04-27T17:21:25.000+02:00
diff --git a/packages/vue-component/package.js b/packages/vue-component/package.js
@@ -38,6 +38,7 @@ Package.registerBuildPlugin({
     'autoprefixer': '6.7.5',
     'vue-template-compiler': '2.2.6',
     'vue-template-es2015-compiler': '1.5.1',
+    'parse5': '3.0.2',
   }
 });
 
diff --git a/packages/vue-component/plugin/tag-scanner.js b/packages/vue-component/plugin/tag-scanner.js
@@ -1,172 +1,112 @@
+import parse5 from 'parse5'
+import { Meteor } from 'meteor/meteor'
+
 scanHtmlForTags = function scanHtmlForTags(options) {
-  const scan = new HtmlScan(options);
-  return scan.getTags();
+  try {
+    return parseHtml(options)
+  } catch (e) {
+    throwCompileError(e)
+  }
 };
 
-/**
- * Scan an HTML file for top-level tags and extract their contents. Pass them to
- * a tag handler (an object with a handleTag method)
- *
- * This is a primitive, regex-based scanner.  It scans
- * top-level tags, which are allowed to have attributes,
- * and ignores top-level HTML comments.
- */
-class HtmlScan {
-  /**
-   * Initialize and run a scan of a single file
-   * @param  {String} sourceName The filename, used in errors only
-   * @param  {String} contents   The contents of the file
-   * @param  {String[]} tagNames An array of tag names that are accepted at the
-   * top level. If any other tag is encountered, an error is thrown.
-   */
-  constructor({
-        sourceName,
-        contents,
-        tagNames
-      }) {
-    this.sourceName = sourceName;
-    this.contents = contents;
-    this.tagNames = tagNames;
-
-    this.rest = contents;
-    this.index = 0;
-
-    this.tags = [];
-
-    tagNameRegex = this.tagNames.join("|");
-    const openTagRegex = new RegExp(`^((<(${tagNameRegex})\\b)|(<!--)|(<!DOCTYPE|{{!)|$)`, "i");
-
-    while (this.rest) {
-      // skip whitespace first (for better line numbers)
-      this.advance(this.rest.match(/^\s*/)[0].length);
-
-      const match = openTagRegex.exec(this.rest);
-
-      if (! match) {
-        this.throwCompileError(`Expected one of: <${this.tagNames.join('>, <')}>`);
-      }
-
-      const matchToken = match[1];
-      const matchTokenTagName =  match[3];
-      const matchTokenComment = match[4];
-      const matchTokenUnsupported = match[5];
-
-      const tagStartIndex = this.index;
-      this.advance(match.index + match[0].length);
-
-      if (! matchToken) {
-        break; // matched $ (end of file)
-      }
+const parseHtml = Meteor.wrapAsync(({
+    sourceName,
+    contents,
+    tagNames
+  }, cb) => {
+
+  const tags = []
+
+  const parser = new parse5.SAXParser({
+    locationInfo: true,
+  })
+
+  let depth = 0
+  let info
+
+  function addTag() {
+    const tagContents = contents.substring(info.start.index, info.end.index)
+
+    const tag = {
+      tagName: info.tag.name,
+      attribs: info.tag.attrs,
+      contents: tagContents,
+      contentsStartIndex: info.start.index,
+      tagStartIndex: info.tag.index,
+      fileContents: contents,
+      sourceName: sourceName,
+      startLine: info.start.line,
+      endLine: info.end.line
+    }
 
-      if (matchTokenComment === '<!--') {
-        // top-level HTML comment
-        const commentEnd = /--\s*>/.exec(this.rest);
-        if (! commentEnd)
-          this.throwCompileError("unclosed HTML comment in template file");
-        this.advance(commentEnd.index + commentEnd[0].length);
-        continue;
-      }
+    // save the tag
+    tags.push(tag)
+  }
 
-      if (matchTokenUnsupported) {
-        switch (matchTokenUnsupported.toLowerCase()) {
-        case '<!doctype':
-          this.throwCompileError(
-            "Can't set DOCTYPE here.  (Meteor sets <!DOCTYPE html> for you)");
-        case '{{!':
-          this.throwCompileError(
-            "Can't use '{{! }}' outside a template.  Use '<!-- -->'.");
+  parser.on('startTag', (name, attrs, selfClosing, location) => {
+    if (depth === 0) {
+      if (tagNames.indexOf(name) !== -1) {
+        info = {
+          tag: {
+            name,
+            attrs: attrs.reduce((dic, attr) => {
+              const value = attr.value === '' ? true : attr.value
+              dic[attr.name] = value
+              return dic
+            }, {}),
+            index: location.startOffset,
+          },
+          start: {
+            line: location.line,
+            index: location.endOffset,
+          },
         }
 
-        this.throwCompileError();
-      }
+        if (selfClosing) {
+          info.end = {
+            line: location.line,
+            index: location.endOffset,
+          }
 
-      // otherwise, a <tag>
-      const tagName = matchTokenTagName.toLowerCase();
-      const tagAttribs = {}; // bare name -> value dict
-      const tagPartRegex = /^\s*((([a-zA-Z0-9:_-]+)\s*(=\s*(["'])(.*?)\5)?)|(>))/;
-
-      // read attributes
-      let attr;
-      while ((attr = tagPartRegex.exec(this.rest))) {
-        const attrToken = attr[1];
-        const attrKey = attr[3];
-        let attrValue = attr[6];
-        this.advance(attr.index + attr[0].length);
-
-        if (attrToken === '>') {
-          break;
+          addTag()
+        } else {
+          depth++
         }
-
-        // XXX we don't HTML unescape the attribute value
-        // (e.g. to allow "abcd&quot;efg") or protect against
-        // collisions with methods of tagAttribs (e.g. for
-        // a property named toString)
-        attrValue = attrValue && attrValue.match(/^\s*([\s\S]*?)\s*$/)[1]; // trim
-        tagAttribs[attrKey] = attrValue;
-      }
-
-      if (! attr) { // didn't end on '>'
-        this.throwCompileError(`Parse error in tag ${tagName}`);
       }
-
-      // find </tag>
-      const end = (new RegExp('</'+tagName+'\\s*>', 'i')).exec(this.rest);
-      if (! end) {
-        this.throwCompileError("unclosed <"+tagName+">");
-      }
-
-      const tagContents = this.rest.slice(0, end.index);
-      const contentsStartIndex = this.index;
-
-      // trim the tag contents.
-      // this is a courtesy and is also relied on by some unit tests.
-      var m = tagContents.match(/^([ \t\r\n]*)([\s\S]*?)[ \t\r\n]*$/);
-      const trimmedContentsStartIndex = contentsStartIndex + m[1].length;
-      const trimmedTagContents = m[2];
-
-      const tag = {
-        tagName: tagName,
-        attribs: tagAttribs,
-        contents: trimmedTagContents,
-        contentsStartIndex: trimmedContentsStartIndex,
-        tagStartIndex: tagStartIndex,
-        fileContents: this.contents,
-        sourceName: this.sourceName
-      };
-
-      // save the tag
-      this.tags.push(tag);
-
-      // advance afterwards, so that line numbers in errors are correct
-      this.advance(end.index + end[0].length);
+    } else if (name === info.tag.name) {
+      depth ++
     }
-  }
+  })
 
-  /**
-   * Advance the parser
-   * @param  {Number} amount The amount of characters to advance
-   */
-  advance(amount) {
-    this.rest = this.rest.substring(amount);
-    this.index += amount;
-  }
-
-  throwCompileError(msg, overrideIndex) {
-    const finalIndex = (typeof overrideIndex === 'number' ? overrideIndex : this.index);
+  parser.on('endTag', (name, location) => {
+    if (depth !== 0 && name === info.tag.name) {
+      depth--
 
-    const err = new TemplatingTools.CompileError();
-    err.message = msg || "bad formatting in template file";
-    err.file = this.sourceName;
-    err.line = this.contents.substring(0, finalIndex).split('\n').length;
+      if (depth === 0) {
+        info.end = {
+          line: location.line,
+          index: location.startOffset - 1,
+        }
 
-    throw err;
-  }
+        addTag()
+      }
+    }
+  })
+
+  parser.on('end', () => {
+    if (depth !== 0) {
+      cb({
+        path: sourceName,
+        line: info.start.line,
+        tag: info.tag.name,
+        message: `Missing closing </${info.tag.name}>`,
+      }, null)
+      return
+    }
 
-  throwBodyAttrsError(msg) {
-    this.parseError(msg);
-  }
+    cb(null, tags)
+  })
 
-  getTags() {
-    return this.tags;
-  }
-}
+  parser.write(contents)
+  parser.end()
+})

Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,7 @@ Package.registerBuildPlugin({`
`38`	`38`	`'autoprefixer': '6.7.5',`
`39`	`39`	`'vue-template-compiler': '2.2.6',`
`40`	`40`	`'vue-template-es2015-compiler': '1.5.1',`
	`41`	`+ 'parse5': '3.0.2',`
`41`	`42`	`}`
`42`	`43`	`});`
`43`	`44`