eBay is one of the largest online marketplaces with millions of active listings at any given time. In this tutorial, we'll walk through how to scrape and extract key data from eBay listings using C++ and the libcurl library.
Setup
We'll need the following libraries installed: libcurl (for making HTTP requests) and libxml2 (for parsing the returned HTML).
On Linux, you can install these using your package manager.
We'll also define the starting eBay URL to scrape and a string for the user agent header to spoof a browser visit:
#include <curl/curl.h>
#include <libxml/HTMLparser.h>

// eBay search-results page to scrape. (Note: no angle brackets around the
// URL — those were a markdown autolink artifact and would make the URL
// invalid when passed to libcurl.)
const char* url = "https://www.ebay.com/sch/i.html?_nkw=baseball";

// Browser identity to present; replace with your own browser's UA string.
const char* userAgent = "Mozilla/5.0 ...";
Replace the user agent string with your own browser's user agent.
Fetch the Listings Page
We'll use the libcurl library to fetch the HTML content from the eBay URL:
CURL* curl = curl_easy_init();
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_USERAGENT, userAgent);
// Perform request and get response
curl_easy_perform(curl);
char* response;
curl_easy_getinfo(curl, CURLINFO_RESPONSE_STRING, &response);
xmlDocPtr doc = htmlReadMemory(response, strlen(response), "UTF-8", XML_PARSE_NOBLANKS | XML_PARSE_HUGE);
We set the user agent and then fetch the response string. This is parsed into an XML doc using libxml2.
Extract Listing Data
Now we can extract the key data points from each listing. eBay encloses each listing's details in a div whose class contains s-item__info. Once we have collected those listing nodes, we can loop through each one and extract its text content or attributes. Here is the code to find the listing nodes and pull out each field. This provides an overview of scraping eBay listings in C++ using libcurl and libxml2. The key steps are making the HTTP request, parsing the HTML, finding the listing nodes, and extracting the text/attributes.
Get HTML from any page with a simple API call. We handle proxy rotation, browser identities, automatic retries, CAPTCHAs, JavaScript rendering, etc automatically for you
curl "http://api.proxiesapi.com/?key=API_KEY&url=https://example.com"
<!doctype html>

xmlNodePtr root = xmlDocGetRootElement(doc);
xmlNodePtr listingItems[100];
int listingsLen = 0;
for (xmlNodePtr item = root->children; item; item = item->next) {
if (item->type == XML_ELEMENT_NODE && xmlStrEqual(item->name, BAD_CAST "div") &&
xmlHasProp(item, BAD_CAST "class") &&
strstr((char*) xmlGetProp(item, BAD_CAST "class"), "s-item__info")) {
listingItems[listingsLen++] = item;
}
}
for (int i = 0; i < listingsLen; i++) {
xmlChar* title = xmlNodeListGetString(doc, listingItems[i]->xmlChildrenNode, 1);
xmlChar* url = xmlGetProp(listingItems[i], BAD_CAST "href");
// And so on for other fields like price, seller etc.
cout << "Title: " << title << endl;
cout << "URL: " << url << endl;
xmlFree(title);
xmlFree(url);
}
Full Code
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>

// Page to scrape and the browser identity to present.
const char* url = "https://www.ebay.com/sch/i.html?_nkw=baseball";
const char* userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36";

// libcurl write callback: append each received chunk to the std::string
// passed via CURLOPT_WRITEDATA. (There is no CURLINFO_RESPONSE_STRING in
// the libcurl API; the caller must buffer the body itself.)
static size_t writeCallback(char* data, size_t size, size_t nmemb, void* userp) {
    size_t total = size * nmemb;
    static_cast<std::string*>(userp)->append(data, total);
    return total;  // any other return value aborts the transfer
}

enum { MAX_LISTINGS = 100 };
static xmlNodePtr listingItems[MAX_LISTINGS];
static int listingsLen = 0;

// Depth-first search for <div> elements whose class contains "s-item__info"
// (eBay wraps each listing's details in such a div). Scanning only the
// direct children of <html>, as the original did, finds nothing because the
// listing divs are nested deep inside <body>.
static void collectListings(xmlNodePtr node) {
    for (xmlNodePtr item = node; item; item = item->next) {
        if (item->type != XML_ELEMENT_NODE) continue;
        if (xmlStrEqual(item->name, BAD_CAST "div")) {
            xmlChar* cls = xmlGetProp(item, BAD_CAST "class");
            if (cls && strstr((char*)cls, "s-item__info") &&
                listingsLen < MAX_LISTINGS) {  // guard the fixed-size array
                listingItems[listingsLen++] = item;
            }
            xmlFree(cls);  // xmlGetProp allocates a copy — must be freed
        }
        collectListings(item->children);
    }
}

// Text content of the first descendant whose class contains `cls`, or NULL.
// Caller owns (must xmlFree) the returned string.
// (xmlNodeListGetString's third argument is an entity-substitution flag,
// not a child index — the original's 1/5/7/11/... calls all returned the
// same concatenated text, so we select fields by class name instead.)
static xmlChar* textByClass(xmlNodePtr node, const char* cls) {
    for (xmlNodePtr cur = node; cur; cur = cur->next) {
        if (cur->type != XML_ELEMENT_NODE) continue;
        xmlChar* attr = xmlGetProp(cur, BAD_CAST "class");
        bool hit = attr && strstr((char*)attr, cls);
        xmlFree(attr);
        if (hit) return xmlNodeGetContent(cur);
        xmlChar* found = textByClass(cur->children, cls);
        if (found) return found;
    }
    return NULL;
}

// href attribute of the first <a> descendant, or NULL (caller frees).
static xmlChar* firstHref(xmlNodePtr node) {
    for (xmlNodePtr cur = node; cur; cur = cur->next) {
        if (cur->type != XML_ELEMENT_NODE) continue;
        if (xmlStrEqual(cur->name, BAD_CAST "a")) {
            xmlChar* href = xmlGetProp(cur, BAD_CAST "href");
            if (href) return href;
        }
        xmlChar* found = firstHref(cur->children);
        if (found) return found;
    }
    return NULL;
}

// Print one labelled field, tolerating a missing (NULL) value — streaming a
// NULL char* to cout is undefined behavior.
static void printField(const char* label, const xmlChar* value) {
    std::cout << label << (value ? (const char*)value : "(not found)") << std::endl;
}

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL* curl = curl_easy_init();
    if (!curl) {
        std::cerr << "curl_easy_init() failed" << std::endl;
        return 1;
    }

    std::string response;
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, userAgent);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);  // eBay may redirect

    CURLcode res = curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    if (res != CURLE_OK) {
        std::cerr << "curl_easy_perform() failed: " << curl_easy_strerror(res) << std::endl;
        return 1;
    }

    // htmlReadMemory takes (buffer, size, URL, encoding, options) — the
    // original omitted an argument and passed XML_PARSE_* flags where
    // HTML_PARSE_* ones are expected.
    xmlDocPtr doc = htmlReadMemory(response.c_str(), (int)response.size(), url,
                                   "UTF-8",
                                   HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR |
                                       HTML_PARSE_NOWARNING);
    if (!doc) {
        std::cerr << "Failed to parse HTML" << std::endl;
        return 1;
    }

    collectListings(xmlDocGetRootElement(doc));

    for (int i = 0; i < listingsLen; i++) {
        xmlNodePtr item = listingItems[i];
        // Class names below follow eBay's s-item__* naming seen in the
        // listing container — NOTE(review): verify against the live page
        // markup; eBay changes these over time.
        xmlChar* title = textByClass(item->children, "s-item__title");
        xmlChar* itemUrl = firstHref(item->children);
        xmlChar* price = textByClass(item->children, "s-item__price");
        xmlChar* details = textByClass(item->children, "s-item__subtitle");
        xmlChar* sellerInfo = textByClass(item->children, "s-item__seller-info-text");
        xmlChar* shippingCost = textByClass(item->children, "s-item__shipping");
        xmlChar* location = textByClass(item->children, "s-item__location");
        xmlChar* sold = textByClass(item->children, "s-item__quantitySold");

        printField("Title: ", title);
        printField("URL: ", itemUrl);
        printField("Price: ", price);
        printField("Details: ", details);
        printField("Seller: ", sellerInfo);
        printField("Shipping: ", shippingCost);
        printField("Location: ", location);
        printField("Sold: ", sold);

        xmlFree(title);
        xmlFree(itemUrl);
        xmlFree(price);
        xmlFree(details);
        xmlFree(sellerInfo);
        xmlFree(shippingCost);
        xmlFree(location);
        xmlFree(sold);
    }

    xmlFreeDoc(doc);
    xmlCleanupParser();
    return 0;
}
Browse by tags:
Browse by language:
The easiest way to do Web Scraping
Try ProxiesAPI for free
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
...