The Ultimate html5ever Cheat Sheet for Rust

html5ever is a fast, standards-compliant HTML5 parser written in Rust. This cheat sheet is a quick reference to its main features, grouped by task.

Installation

Add to Cargo.toml:

[dependencies]
html5ever = "0.25"

Parsing

From string:

let html = r#"<html>...</html>"#;
let doc = parse_document(html.as_bytes());

From bytes:

let bytes: Vec<u8> = fetch_bytes();
let doc = parse_document(bytes);

From reader:

let mut reader = File::open("doc.html")?;
let doc = parse_document(&mut reader);

From file:

let bytes = fs::read("doc.html")?;
let doc = parse_document(bytes);

Custom options:

let opts = ParseOpts::default().scripting_enabled(true);
let doc = parse_document_with_opts(html.as_bytes(), opts);

Serialization

To string:

let html = serialize(&doc, Default::default());

To writer:

let mut buffer = Vec::new();
serialize(&doc, &mut buffer);

To file:

serialize(&doc, File::create("out.html")?);

Custom options:

let opts = SerializeOpts::default()
  .minify(true)
  .format(SerializeFormat::HTML);

serialize(&doc, opts);

Traversal

Child elements:

for child in root.children() {
  // ...
}

Descendants:

fn traverse(node: &Node) {
  for child in node.children() {
    traverse(child);
  }
}

Parent element:

let parent = node.parent_element();

Previous sibling:

let prev = node.prev_sibling();

Next sibling:

let next = node.next_sibling();

Modification

Append child:

parent.append_child(&new_node);

Insert before:

parent.insert_before(&new_node, &ref_node);

Remove child:

parent.remove_child(&child);

Replace child:

parent.replace_child(&new, &old);

Set attribute:

el.set_attribute("class", "blue");

Set id:

el.set_id("main");

Set text content:

el.set_text_content(Some("Hello!"));

Creation

New element:

let el = Element::new(local_name!("div"));

New text node:

let text = TextNode::new("Hi there!");

New comment:

let comment = Comment::new("A comment");

Document fragment:

let frag = DocumentFragment::new();

Namespaces

Register a namespace:

let ns = Namespace::new(None, local_name!("svg"));
doc.namespace_bindings_mut().push(ns);

Namespaced element:

let circle = Element::new(local_name!("circle"), &["svg"]);

Attributes

Boolean attribute:

el.set_bool_attribute(local_name!("hidden"), true);

Custom attribute:

el.set_custom_attribute(local_name!("data-id"), Atom::from("123"));

Encoding

From encoded bytes:

let bytes = include_bytes!("doc.html");
let encoding = EncodingRef::Utf8;
let doc = parse_document_from_utf8_expecting(bytes, encoding);

Handle invalid sequences:

let opts = ParseOpts::default().replace_invalid_codepoints(true);

Validation

Validate against a DTD:

let dtd = include_bytes!("doctype.dtd");
parse_document(html).validate_dtd(dtd);

Validate against a schema:

let schema = include_bytes!("schema.xsd");
parse_document(html).validate_schema(schema);

Performance

Parallel parsing:

let html = include_str!("doc.html");
let doc = parse_html_parallel(html, &Default::default());

Real World Uses

HTML parsers, validators, and converters

Web scraping and automation

Archiving sites

Sanitizing/filtering HTML

Migrating between systems

Building HTML editors and CMSes

Data extraction

PDF generation

Static site generators

HTML testing suites

Browser engine integration

The Ultimate html5ever Cheat Sheet for Rust

Installation

Parsing

Serialization

Traversal

Modification

Creation

Namespaces

Attributes

Encoding

Validation

Performance

Real World Uses

Browse by language:

The easiest way to do Web Scraping

The Ultimate html5ever Cheat Sheet for Rust

Installation

Parsing

Serialization

Traversal

Modification

Creation

Namespaces

Attributes

Encoding

Validation

Performance

Real World Uses

The easiest way to do Web Scraping

Don't leave just yet!