Ecosyste.ms: Awesome
An open API service indexing awesome lists of open source software.
https://github.com/zadean/htmerl
HTML Parser in Erlang
https://github.com/zadean/htmerl
erlang html-parser html5
Last synced: 3 months ago
JSON representation
HTML Parser in Erlang
- Host: GitHub
- URL: https://github.com/zadean/htmerl
- Owner: zadean
- License: apache-2.0
- Created: 2019-06-14T10:54:13.000Z (over 5 years ago)
- Default Branch: main
- Last Pushed: 2023-02-12T12:42:32.000Z (almost 2 years ago)
- Last Synced: 2024-10-12T04:33:45.245Z (4 months ago)
- Topics: erlang, html-parser, html5
- Language: Erlang
- Size: 60.5 KB
- Stars: 14
- Watchers: 2
- Forks: 2
- Open Issues: 0
-
Metadata Files:
- Readme: README.md
- License: LICENSE
Awesome Lists containing this project
README
htmerl
=====

An OTP library for parsing HTML documents.
This library attempts to follow the [HTML 5.2 specification](https://www.w3.org/TR/html52/)
for tokenizing and parsing the HTML syntax as closely as possible.
This means that common errors that browsers accept are also accepted here and sanitized.

The output from `htmerl:sax/2` is identical to the XML SAX events produced
by `xmerl_sax_parser` except that here all values and names are UTF-8 binary
and not lists.

Usage
-----
There are two ways to use `htmerl`.
The first is to build a tree directly from the parsed input. Notice that the missing "head" element was added.

```erlang
1> htmerl:simple(<<"Hello">>).
{htmlDocument,<<"html">>,<<>>,<<>>,
[{htmlElement,<<"html">>,<<"http://www.w3.org/1999/xhtml">>,
[],
[{htmlElement,<<"head">>,<<"http://www.w3.org/1999/xhtml">>,
[],[]},
{htmlElement,<<"body">>,<<"http://www.w3.org/1999/xhtml">>,
[],
[{htmlText,<<"Hello">>,text}]}]}]}
```

The second is to use it as a SAX parser. Calling `htmerl:sax/1` returns a list of SAX events.
`htmerl:sax/2` calls a user-defined function.

```erlang
2> htmerl:sax(<<"Hello">>).
{ok,[startDocument,
{startDTD,<<"html">>,<<>>,<<>>},
endDTD,
{startPrefixMapping,<<>>,<<"http://www.w3.org/1999/xhtml">>},
{startElement,<<"http://www.w3.org/1999/xhtml">>,<<"html">>,
{<<>>,<<"html">>},
[]},
{startElement,<<"http://www.w3.org/1999/xhtml">>,<<"head">>,
{<<>>,<<"head">>},
[]},
{endElement,<<"http://www.w3.org/1999/xhtml">>,<<"head">>,
{<<>>,<<"head">>}},
{startElement,<<"http://www.w3.org/1999/xhtml">>,<<"body">>,
{<<>>,<<"body">>},
[]},
{characters,<<"Hello">>},
{endElement,<<"http://www.w3.org/1999/xhtml">>,<<"body">>,
{<<>>,<<"body">>}},
{endElement,<<"http://www.w3.org/1999/xhtml">>,<<"html">>,
{<<>>,<<"html">>}},
{endPrefixMapping,<<>>},
endDocument],
[]}
```

or with a user-defined function and state:

```erlang
3> F = fun(E, _, S) -> io:format("Event: ~p~n", [E]), S end,
Opts = [{event_fun, F}, {user_state, []}],
htmerl:sax(<<"Hello">>, Opts).
Event: startDocument
Event: {startDTD,<<"html">>,<<>>,<<>>}
Event: endDTD
Event: {startPrefixMapping,<<>>,<<"http://www.w3.org/1999/xhtml">>}
Event: {startElement,<<"http://www.w3.org/1999/xhtml">>,<<"html">>,
{<<>>,<<"html">>},
[]}
Event: {startElement,<<"http://www.w3.org/1999/xhtml">>,<<"head">>,
{<<>>,<<"head">>},
[]}
Event: {endElement,<<"http://www.w3.org/1999/xhtml">>,<<"head">>,
{<<>>,<<"head">>}}
Event: {startElement,<<"http://www.w3.org/1999/xhtml">>,<<"body">>,
{<<>>,<<"body">>},
[]}
Event: {characters,<<"Hello">>}
Event: {endElement,<<"http://www.w3.org/1999/xhtml">>,<<"body">>,
{<<>>,<<"body">>}}
Event: {endElement,<<"http://www.w3.org/1999/xhtml">>,<<"html">>,
{<<>>,<<"html">>}}
Event: {endPrefixMapping,<<>>}
Event: endDocument
{ok,[],[]}
```

or extracting values using the SAX events in a module:

```erlang
-module(htmerl_example).

-export([run/0]).

run() ->
Html =
<<"Check
nothing herethis bold garbage
g"
"arbageout!
">>,
XPath = <<"html/body/p">>,
Path =
lists:reverse(
binary:split(XPath, <<"/">>, [global])),
Opts = [{event_fun, fun xpath/3}, {user_state, {[], Path, []}}],
{ok, TextList, []} = htmerl:sax(Html, Opts),
TextList.

xpath({characters, Text}, _LineNum, {Path, Path, Acc}) ->
{Path, Path, [Text | Acc]};
xpath({endElement, _Ns, Ln, _}, _LineNum, {[Ln | Path], XPath, Acc}) ->
{Path, XPath, Acc};
xpath({startElement, _Ns, Ln, _, _Atts}, _LineNum, {Path, XPath, Acc}) ->
{[Ln | Path], XPath, Acc};
xpath(endDocument, _LineNum, {_Path, _XPath, Acc}) ->
lists:reverse(Acc);
xpath(_Event, _LineNum, State) ->
State.
```

```erlang
4> htmerl_example:run().
[<<"Check">>,<<"this">>,<<"out!">>]
```

Build
-----

    $ rebar3 compile