Overview
The rvest package lets you scrape (extract data from) HTML web pages in R. It provides functions for selector-based web scraping and tools for parsing and navigating HTML documents.
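As a minimal sketch (assuming a page at http://example.com containing paragraph elements), a typical pipeline reads a page, selects nodes, and extracts their contents:
library(rvest)
# Read a page, select all <p> elements, and extract their text
read_html("http://example.com") %>%
  html_nodes("p") %>%
  html_text()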
Main Functions
# Read a page
page <- read_html("http://example.com")
# Find all paragraphs
paragraphs <- html_nodes(page, "p")
# Extract text
text <- html_text(paragraphs)
# Extract text, preserving line breaks and whitespace
text2 <- html_text2(paragraphs)
# Extract attributes (href comes from links, not paragraphs)
links <- html_nodes(page, "a")
hrefs <- html_attr(links, "href")
# Get the raw HTML of nodes
as.character(paragraphs)
# Find nodes by CSS selector
nodes <- html_nodes(page, "div.content")
# Extract info from table
table <- html_table(html_nodes(page, "table"))[[1]]
# Fill in and submit a form (forms are submitted through a session)
session <- html_session("http://example.com")
form <- set_values(html_form(session)[[1]], search = "rvest")
result <- submit_form(session, form)
Selecting Nodes
Use CSS selectors to find nodes:
html_nodes(page, "div") # all div elements
html_nodes(page, "div#intro") # div with id intro
html_nodes(page, "div.results") # div with class results
html_nodes(page, "div#intro.featured") # div with both id and class
html_nodes(page, "div > p") # p inside div
html_nodes(page, "ul > li:nth-child(2)") # second li in ul
Some other selector examples:
html_nodes(page, "a:contains('R')") # a containing R
html_nodes(page, ":not(div)") # nodes except div
html_nodes(page, "li:nth-child(-n+3)") # first 3 li
Extracting Information
Extract attributes, text, and HTML:
text <- html_text(nodes) # text content
html <- as.character(nodes) # raw HTML of each node
hrefs <- html_attr(links, "href") # attribute
imgs <- html_attr(img_nodes, "src") # attribute
Extract information from tables:
tables <- html_table(html_nodes(page, "table"))
df <- tables[[1]] # extract as dataframe
Use xpath selectors for more complex queries:
html_nodes(page, xpath = '//*[@id="intro"]/p') # xpath selector
html_text(html_nodes(page, xpath = '//p[@class="summary"]')) # xpath and extract text
Parsing and Navigating
Parse document structure:
url <- "http://example.com"
page <- read_html(url)
title <- html_text(html_nodes(page, "title"))
h1 <- html_text(html_nodes(page, "h1"))
links <- html_nodes(page, "a") # all links
Navigate to other pages:
href <- html_attr(links[[12]], "href")
other_page <- read_html(href) # form login is covered under Navigation below
Handling Issues
Set a user agent to avoid bot blocking (read_html() has no user_agent argument; pass it a httr request instead):
library(httr)
page <- read_html(GET("http://example.com", user_agent("Mozilla/5.0")))
Deal with malformed HTML and encoding problems:
page <- read_html("http://example.com", encoding = "UTF-8") # specify the encoding explicitly
html_name(nodes) # inspect tag names to check what was actually parsed
text <- repair_encoding(text, from = "ISO-8859-1") # fix mis-encoded text (older rvest versions)
Advanced Usage
Full browser interaction using RSelenium:
library(RSelenium)
driver <- rsDriver(browser = "firefox") # starts a server and an open client
client <- driver$client
client$navigate("http://example.com")
page <- read_html(client$getPageSource()[[1]]) # parse the rendered source with rvest
title <- html_text(html_nodes(page, "title"))
client$close()
Custom login with a session:
session <- html_session("http://example.com") %>%
  jump_to("login")
form <- set_values(html_form(session)[[1]], username = "user123", password = "secret")
session <- submit_form(session, form) %>%
  jump_to("account")
Scraping JavaScript-generated content: rvest only sees the raw HTML returned by the server and cannot execute JavaScript, so html_session() and jump_to() will not render dynamic content. Drive a real browser with RSelenium (as above) and parse the rendered source instead:
page <- read_html(client$getPageSource()[[1]])
html_text(html_nodes(page, "#dynamic-content"))
Best Practices
When scraping websites, be ethical and respectful: honor robots.txt, identify your scraper, and throttle your requests:
# Add 2 second pause between requests
Sys.sleep(2)
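For example, a minimal polite-scraping loop over a few hypothetical URLs pauses between requests:
urls <- c("http://example.com/page1", "http://example.com/page2") # hypothetical URLs
results <- list()
for (u in urls) {
  results[[u]] <- html_text(html_nodes(read_html(u), "p"))
  Sys.sleep(2) # pause between requests
}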
More rvest Functions
Extract element names:
names <- html_name(nodes)
Extract child nodes:
children <- html_children(node)
Extract siblings:
siblings <- html_siblings(node)
Processing Extracted Data
Process extracted tables using tidyverse:
library(dplyr)
tables <- html_table(doc)
cleaned_data <- tables[[1]] %>%
  mutate(new_column = ...) %>%
  filter(...)
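A concrete sketch, assuming the first table has a hypothetical score column:
library(dplyr)
cleaned_data <- tables[[1]] %>%
  mutate(score = as.numeric(score)) %>% # hypothetical column: coerce to numeric
  filter(!is.na(score))                 # drop rows that failed to parse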
Tips and Tricks
Map rvest over multiple pages:
library(purrr)
urls <- c("page1", "page2")
map(urls, ~ read_html(.x) %>%
  html_nodes("p") %>%
  html_text())
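To combine per-page tables into a single data frame, purrr's map_dfr() works the same way (a sketch assuming each page contains at least one table):
library(purrr)
all_rows <- map_dfr(urls, function(u) {
  Sys.sleep(2) # be polite between requests
  html_table(read_html(u))[[1]]
})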
Troubleshooting
Handle HTTP errors like 404 or 503 status codes:
page <- tryCatch(
  read_html("http://example.com"),
  error = function(e) e
)
if (inherits(page, "error")) {
  # handle error
} else {
  # scrape page
}
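To react to specific status codes such as 404 or 503, you can also inspect the response directly with httr before parsing (a sketch; httr is not part of rvest):
library(httr)
resp <- GET("http://example.com")
if (status_code(resp) == 200) {
  page <- read_html(resp)
} else {
  message("Request failed with status ", status_code(resp))
}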
Selectors
More examples of CSS selectors:
html_nodes(page, "div#id1") # by id
html_nodes(page, "div.classname") # by class
html_nodes(page, "a[href='about']") # attribute equals
html_nodes(page, "tr:nth-child(even)") # even rows
html_nodes(page, "li:contains('R')") # contains text
Navigation
Follow links and scrape multiple pages:
page1 <- read_html("http://example.com")
page2 <- read_html(html_attr(html_nodes(page1, "a"), "href")[5]) # assumes an absolute href
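Extracted hrefs are often relative; xml2's url_absolute() resolves them against the page URL (a small sketch, assuming page1 came from http://example.com):
library(xml2)
href <- html_attr(html_nodes(page1, "a"), "href")[5]
page2 <- read_html(url_absolute(href, "http://example.com"))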
Submit forms and log in:
page <- html_session("http://example.com")
page <- jump_to(page, "login")
form <- set_values(html_form(page)[[1]], username = "john", password = "123456")
page <- submit_form(page, form)
page <- jump_to(page, "account")
Writing CSS Selectors
Tips for efficient CSS selectors: prefer ids and classes (#intro, .results) over deep tag paths, keep selectors as short and specific as the page allows, and test them in the browser console with document.querySelectorAll() before using them in R.
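For example (the class name .results is hypothetical):
# Brittle: depends on the whole document structure
html_nodes(page, "body > div > div > ul > li > a")
# Robust: anchored to a stable class
html_nodes(page, ".results a")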
RSelenium Examples
Interact with dynamic pages:
library(RSelenium)
driver <- rsDriver(browser = "chrome")
client <- driver$client
client$navigate("http://example.com")
page <- read_html(client$getPageSource()[[1]]) # getPageSource() returns raw HTML text
html_text(html_nodes(page, "#dynamic-content"))
client$close()
Parsing XML
Parse XML with the xml2 package (which rvest builds on):
library(xml2)
doc <- read_xml("data.xml")
nodes <- xml_find_all(doc, "//item")
values <- xml_text(nodes)
Large Data Strategies
Write to disk when scraping large data:
library(data.table)
dt <- html_table(doc)[[1]]
fwrite(dt, "output.csv")
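When scraping many pages, appending each chunk to disk as it arrives keeps memory use flat (a sketch, assuming each hypothetical URL yields at least one table):
library(data.table)
for (u in urls) {
  dt <- as.data.table(html_table(read_html(u))[[1]])
  fwrite(dt, "output.csv", append = file.exists("output.csv")) # header written once
  Sys.sleep(2)
}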