KSoup is an HTML parser for Kotlin built on top of jsoup. It provides a convenient DSL for extracting and manipulating data from HTML documents.
Getting Started
Add the Gradle dependency:
implementation("io.github.webtools:ksoup:0.3.0")
Parse HTML:
val html = """
<html>
...
</html>
"""
val doc = KSoup.parse(html)
Find elements:
doc.select(".content")
Extract text:
doc.body()!!.text()
Selecting
By CSS query:
doc.select(".main")
By tag:
doc.select("img")
By id:
doc.getElementById("header")
By attribute:
doc.select("[href]")
Custom filters:
doc.select(".txt").filter { it.text().length > 10 }
Traversing
Children:
element.children()
Parents:
element.parents()
Siblings:
element.nextSibling()
element.previousSibling()
Manipulation
Set text:
element.text("new text")
Set HTML:
element.html("<span>new html</span>")
Add class:
element.addClass("highlighted")
Remove class:
element.removeClass("highlighted")
Remove element:
element.remove()
Attributes
Get attribute:
element.attr("href")
Set attribute:
element.attr("href", "link.html")
Remove attribute:
element.removeAttr("class")
Examples
Extract text from paragraphs:
doc.select("p").forEach {
println(it.text())
}
Extract links:
doc.select("a[href]").forEach {
println(it.attr("href"))
}
Change image src:
doc.select("img").forEach {
it.attr("src", "new.png")
}
Validation
Check that the HTML is valid:
val errors = KSoupValidator().validate(doc)
if (errors.isNotEmpty()) {
// handle errors
}
Advanced Usage
Async parsing:
KSoup.parseAsync(html) { doc ->
// process doc
}
Multi-threading:
docs.map { doc ->
thread {
// extract data
}
}
More Examples
val links = doc.select("a[href]").map { it.attr("href") }
val headers = doc.select("h1, h2, h3").map { it.text() }
doc.select(".ad").remove()
Tips & Tricks
doc.select(".news").hasClass("updated")
doc.select("a").hasAttr("target")
doc.select(".news").has("img")
val html = doc.select("p").outerHtml()
KSoup.parse(htmlFragment, "")
Threading
GlobalScope.launch {
val doc = KSoup.parseAsync(html)
// process doc
}
doc.select(".chapter").map { chapter ->
thread {
// extract data from each chapter
}
}
Validation Options
val rules = object : ValidatorRules {
override fun getTagRules() = //...
}
KSoupValidator(rules).validate(doc)
KSoupValidator().ignore(MissingAltText::class).validate(doc)
KSoupValidator().autoCorrect().validate(doc)