Saturday, January 7, 2017

EBNF Grammar for Parsing Chrome Bookmarks

The bookmarks HTML file exported by Chrome is not valid HTML: it follows the Netscape bookmark-file format, which has its own rules and its own DTD. Here is an ANTLR 4 grammar for parsing such a file, with support for Unicode characters in bookmark names.
grammar Bookmarks;

// Entry rule: an optional DOCTYPE, then any mix of comments and header
// elements (META/TITLE/H1), then the single top-level bookmark list.
// Comments and header elements are accepted in any interleaving; the former
// `misc* meta* misc*` was ambiguous when no header element was present and
// rejected a comment placed between two header elements.
document : prolog? (misc | meta)* dl misc*;

// The document prolog is a single DTD token (the <!DOCTYPE ...> declaration).
prolog
    : DTD
    ;

// Non-structural filler between elements.
// The lexer rule S carries `-> skip`, so whitespace tokens never reach the
// parser; the former `| S` alternative could not match and has been removed.
misc
    : COMMENT
    ;

// Header elements that precede the bookmark list.
meta
    // Paired element with a one-token body, e.g. <TITLE>Bookmarks</TITLE>.
    // The body is exactly one TEXT token, so a multi-word body does not match.
    : '<' TEXT '>' TEXT '</' TEXT '>'
    // Unpaired element with attributes, e.g. <META HTTP-EQUIV="..." CONTENT="...">.
    | '<' TEXT attribute* '>'
    ;

// A bookmark list: an opening tag pair, the entries, and a closing tag pair.
// Tag names are not checked (any TEXT is accepted), and each pair must be
// written with no whitespace between '>' and '<', because '><' is one token.
dl
    : '<' TEXT '><' TEXT '>'        // opening pair, e.g. <DL><p>
      misc* dt* misc*
      '</' TEXT '><' TEXT '>'       // closing pair, e.g. </DL><p>
    ;

// One entry of a bookmark list: either a <DT>-prefixed element (a folder
// heading or a link) or a nested list.
dt
    : '<' TEXT '><' tag attribute*
      ( '>' content '</'            // element with a title, e.g. <DT><A ...>Google</A>
      | '></'                       // element with an empty title, e.g. <DT><A ...></A>
      )
      tag '>'
    | dl                            // nested folder contents
    ;

// A single NAME="value" pair inside a tag.
// Whitespace between attributes is discarded by the lexer (S is `-> skip`),
// so the former `| S` alternative was unreachable and has been removed.
attribute
    : attributeName '=' attributeValue
    ;

// Name of the element that follows <DT>: the dedicated H3 token, or any
// other name, which the lexer delivers as TEXT (e.g. A).
tag : H3 | TEXT ;

// Attribute name, e.g. HREF or ADD_DATE.
attributeName
    : TEXT
    ;

// Attribute value; the VAL token includes the surrounding double quotes.
attributeValue
    : VAL
    ;

// Visible title of a folder or link: one or more whitespace-separated words.
// The lexer rule H3 is declared before TEXT, so the standalone word "H3" in a
// title is tokenised as H3 rather than TEXT; accept it here so that such
// titles parse.
content : (TEXT | H3)+;

// COMMENT is declared before DTD on purpose. Both rules can match the text
// "<!-- ... -->" when the comment body contains no '>', and on an equal-length
// match the lexer picks the rule declared first. Previously COMMENT only won
// because it required trailing whitespace (via the skipped rule S), so a
// comment followed directly by '<' or end of input was tokenised as DTD.
// Trailing whitespace is still absorbed when present, so the token text for
// existing inputs is unchanged.
COMMENT : '<!--' .*? '-->' [ \t\r\n]*;

// Any other "<! ... >" declaration, e.g. <!DOCTYPE NETSCAPE-Bookmark-file-1>.
DTD : '<!' .*? '>';

// Folder-heading tag name. Declared before TEXT so that the exact word "H3"
// is tokenised as H3 rather than TEXT.
H3 : 'H3';

// Double-quoted attribute value, quotes included. It ends at the first
// closing quote; there is no escape mechanism for embedded quotes.
VAL : '"' ~["]* '"';

// A run of ASCII word/URL characters, or a run of (possibly non-ASCII) name
// characters. Whitespace is never part of TEXT: it is discarded by S, so a
// multi-word title arrives as several TEXT tokens.
// '/' and '.' need no escaping inside a character set, and `\s` is not an
// ANTLR escape for whitespace (the letter 's' is already covered by a-z).
// The unknown escapes were removed because newer ANTLR 4 releases report them
// as invalid escape sequences.
TEXT : [A-Za-z0-9:/.@\-_;*]+ | NameChar+;

// Any character allowed inside a name: a name-start character, an ASCII
// digit, an underscore, U+00B7, or a character from the ranges
// U+0300-U+036F and U+203F-U+2040.
fragment NameChar
    : NameStartChar
    | [0-9_\u00B7\u0300-\u036F\u203F-\u2040]
    ;

// Characters that may start a name: ASCII letters plus the listed
// non-ASCII ranges.
fragment NameStartChar
    : [A-Za-z]
    | [\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF]
    | [\u0370-\u037D\u037F-\u1FFF]
    | [\u200C-\u200D]
    | [\u2070-\u218F\u2C00-\u2FEF]
    | [\u3001-\uD7FF]
    | [\uF900-\uFDCF\uFDF0-\uFFFD]
    ;

// Runs of spaces, tabs, carriage returns and newlines are dropped by the
// lexer and never reach the parser.
S : (' ' | '\t' | '\r' | '\n')+ -> skip;

The exported bookmarks sample.
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><H3 ADD_DATE="1481473849" LAST_MODIFIED="1481473992" PERSONAL_TOOLBAR_FOLDER="true">Bookmarks bar</H3>
<DL><p>
<DT><H3 ADD_DATE="1481473866" LAST_MODIFIED="1481473967">Test 1</H3>
<DL><p>
<DT><A HREF="https://encrypted.google.com/" ADD_DATE="1481473884" ICON="">Google</A>
<DT><A HREF="https://yandex.ru/" ADD_DATE="1481473892" ICON="">Яндекс</A>
<DT><A HREF="http://example.com/" ADD_DATE="1481473954">Example Domain</A>
</DL><p>
<DT><H3 ADD_DATE="1481473872" LAST_MODIFIED="1481473980">Test 2</H3>
<DL><p>
<DT><A HREF="https://duckduckgo.com/" ADD_DATE="1481473902" ICON="">DuckDuckGo</A>
<DT><A HREF="https://clojure.news/" ADD_DATE="1481473936" ICON="">Clojure News</A>
<DT><A HREF="http://example.com/" ADD_DATE="1481473955">Example Domain</A>
</DL><p>
<DT><A HREF="https://yandex.ru/" ADD_DATE="1481473893" ICON="">Яндекс</A>
<DT><A HREF="http://www.echojs.com/" ADD_DATE="1481473986" ICON=""></A>
<DT><A HREF="https://clojure.news/" ADD_DATE="1481473992" ICON=""></A>
<DT><H3 ADD_DATE="1481474004" LAST_MODIFIED="1481477692">Test 3</H3>
<DL><p>
<DT><A HREF="https://encrypted.google.com/" ADD_DATE="1481474004" ICON="">Google</A>
<DT><A HREF="https://duckduckgo.com/" ADD_DATE="1481474004" ICON="">DuckDuckGo</A>
<DT><A HREF="https://clojure.news/" ADD_DATE="1481474004" ICON="">Clojure News</A>
<DT><H3 ADD_DATE="1481477681" LAST_MODIFIED="1481477681">Test 4</H3>
<DL><p>
<DT><A HREF="https://clojure.news/" ADD_DATE="1481477681" ICON="">Clojure News</A>
<DT><A HREF="https://news.ycombinator.com/" ADD_DATE="1481477681" ICON="">Hacker News</A>
<DT><A HREF="http://example.com/" ADD_DATE="1481477681">Example Domain</A>
</DL><p>
<DT><A HREF="https://news.ycombinator.com/" ADD_DATE="1481474004" ICON="">Hacker News</A>
<DT><A HREF="http://example.com/" ADD_DATE="1481474004">Example Domain</A>
</DL><p>
</DL><p>
</DL><p>
The clj-antlr library can be used to get a parse tree from the grammar. The snippet below does that. Use a compiled version of the grammar for better performance.
;; Build a parser from the grammar file.
(def bm
  (antlr/parser "/home/kadaj/dev/clojure/bookmarks-parser/grammar/Bookmarks.g4"))

;; Read the exported bookmarks file, parse it and pretty-print the result.
(-> "/home/kadaj/dev/clojure/bookmarks-parser/resources/bookmarks.html"
    slurp
    bm
    pprint)
This produces the following parse tree.
(:document
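(:prolog "<!DOCTYPE NETSCAPE-Bookmark-file-1>")
(:misc
"<!-- This is an automatically generated file.\nIt will be read and overwritten.\nDO NOT EDIT! -->\n")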
(:meta
"<"
"META"
(:attribute
(:attributeName "HTTP-EQUIV")
"="
(:attributeValue "\"Content-Type\""))
(:attribute
(:attributeName "CONTENT")
"="
(:attributeValue "\"text/html; charset=UTF-8\""))
">")
(:meta "<" "TITLE" ">" "Bookmarks" "</" "TITLE" ">")
(:meta "<" "H1" ">" "Bookmarks" "</" "H1" ">")
(:dl
"<"
"DL"
"><"
"p"
">"
(:dt
"<"
"DT"
"><"
(:tag "H3")
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473849\""))
(:attribute
(:attributeName "LAST_MODIFIED")
"="
(:attributeValue "\"1481473992\""))
(:attribute
(:attributeName "PERSONAL_TOOLBAR_FOLDER")
"="
(:attributeValue "\"true\""))
">"
(:content "Bookmarks" "bar")
"</"
(:tag "H3")
">")
(:dt
(:dl
"<"
"DL"
"><"
"p"
">"
(:dt
"<"
"DT"
"><"
(:tag "H3")
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473866\""))
(:attribute
(:attributeName "LAST_MODIFIED")
"="
(:attributeValue "\"1481473967\""))
">"
(:content "Test" "1")
"</"
(:tag "H3")
">")
(:dt
(:dl
"<"
"DL"
"><"
"p"
">"
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://encrypted.google.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473884\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "Google")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://yandex.ru/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473892\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "Яндекс")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"http://example.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473954\""))
">"
(:content "Example" "Domain")
"</"
(:tag "A")
">")
"</"
"DL"
"><"
"p"
">"))
(:dt
"<"
"DT"
"><"
(:tag "H3")
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473872\""))
(:attribute
(:attributeName "LAST_MODIFIED")
"="
(:attributeValue "\"1481473980\""))
">"
(:content "Test" "2")
"</"
(:tag "H3")
">")
(:dt
(:dl
"<"
"DL"
"><"
"p"
">"
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://duckduckgo.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473902\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "DuckDuckGo")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://clojure.news/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473936\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "Clojure" "News")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"http://example.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473955\""))
">"
(:content "Example" "Domain")
"</"
(:tag "A")
">")
"</"
"DL"
"><"
"p"
">"))
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://yandex.ru/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473893\""))
(:attribute (:attributeName "ICON") "=" (:attributeValue "\"\""))
">"
(:content "Яндекс")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"http://www.echojs.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473986\""))
(:attribute (:attributeName "ICON") "=" (:attributeValue "\"\""))
"></"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://clojure.news/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481473992\""))
(:attribute (:attributeName "ICON") "=" (:attributeValue "\"\""))
"></"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "H3")
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481474004\""))
(:attribute
(:attributeName "LAST_MODIFIED")
"="
(:attributeValue "\"1481477692\""))
">"
(:content "Test" "3")
"</"
(:tag "H3")
">")
(:dt
(:dl
"<"
"DL"
"><"
"p"
">"
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://encrypted.google.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481474004\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "Google")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://duckduckgo.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481474004\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "DuckDuckGo")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://clojure.news/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481474004\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "Clojure" "News")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "H3")
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481477681\""))
(:attribute
(:attributeName "LAST_MODIFIED")
"="
(:attributeValue "\"1481477681\""))
">"
(:content "Test" "4")
"</"
(:tag "H3")
">")
(:dt
(:dl
"<"
"DL"
"><"
"p"
">"
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://clojure.news/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481477681\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "Clojure" "News")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://news.ycombinator.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481477681\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "Hacker" "News")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"http://example.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481477681\""))
">"
(:content "Example" "Domain")
"</"
(:tag "A")
">")
"</"
"DL"
"><"
"p"
">"))
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"https://news.ycombinator.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481474004\""))
(:attribute
(:attributeName "ICON")
"="
(:attributeValue "\"\""))
">"
(:content "Hacker" "News")
"</"
(:tag "A")
">")
(:dt
"<"
"DT"
"><"
(:tag "A")
(:attribute
(:attributeName "HREF")
"="
(:attributeValue "\"http://example.com/\""))
(:attribute
(:attributeName "ADD_DATE")
"="
(:attributeValue "\"1481474004\""))
">"
(:content "Example" "Domain")
"</"
(:tag "A")
">")
"</"
"DL"
"><"
"p"
">"))
"</"
"DL"
"><"
"p"
">"))
"</"
"DL"
"><"
"p"
">"))