HTMLParser is an Objective-C wrapper around libxml2 for parsing HTML documents. It provides an event-driven, delegate-based interface similar to NSXMLParser.
Getting Started
Import:
#import "HTMLParser.h"
Initialize:
NSData *data = ...; // HTML data
HTMLParser *parser = [[HTMLParser alloc] initWithData:data];
Set delegate:
parser.delegate = self;
Start parsing:
[parser parse];
Delegate Methods
Did start document:
/// Delegate callback: the parser has started processing the document.
- (void)parserDidStartDocument:(HTMLParser *)parser {
    // Parsing has begun.
}
Did end document:
/// Delegate callback: the parser has finished processing the document.
- (void)parserDidEndDocument:(HTMLParser *)parser {
    // Parsing is complete.
}
Did start element:
/// Delegate callback: an element was opened.
/// @param parser The parser sending the message.
/// @param elementName Name of the element that was opened.
/// @param attributeDict Attributes attached to the element.
- (void)parser:(HTMLParser *)parser
    didStartElement:(NSString *)elementName
         attributes:(NSDictionary *)attributeDict {
    // An element has been opened.
}
Did end element:
/// Delegate callback: an element was closed.
/// @param parser The parser sending the message.
/// @param elementName Name of the element that was closed.
- (void)parser:(HTMLParser *)parser
    didEndElement:(NSString *)elementName {
    // An element has been closed.
}
Found characters:
/// Delegate callback: text content was found.
/// @param parser The parser sending the message.
/// @param string The text that was found.
- (void)parser:(HTMLParser *)parser
    foundCharacters:(NSString *)string {
    // Text has been found.
}
Parsing Options
No error reporting:
parser.reportErrors = NO;
Suppress error reports (useful for broken HTML):
parser.options.HTML_PARSE_NOERROR = YES;
Validation
Check errors:
// Fetch the errors recorded by the parser and react if there are any.
NSArray *parseErrors = [parser errors];
if (parseErrors.count != 0) {
    // Handle errors
}
Tips
Examples
Simple HTML Parsing:
/// Minimal delegate that logs the name of every element the parser opens.
@interface ParserDelegate : NSObject <HTMLParserDelegate>
@end

@implementation ParserDelegate

/// Logs the name of the element that was just opened.
- (void)parser:(HTMLParser *)parser
    didStartElement:(NSString *)elementName
         attributes:(NSDictionary *)attributeDict {
    NSLog(@"Started element: %@", elementName);
}

@end
// Attach a ParserDelegate to a parser created from the HTML data, then parse.
NSData *htmlData = ...; // html
ParserDelegate *delegate = [[ParserDelegate alloc] init];
HTMLParser *parser = [[HTMLParser alloc] initWithData:htmlData];
[parser setDelegate:delegate];
[parser parse];
Extract Text:
// Buffer that accumulates all text found in the document.
NSMutableString *text;

/// Creates an empty buffer once, when the document starts.
- (void)parserDidStartDocument:(HTMLParser *)parser {
    // Allocating the buffer in an element callback would throw away the text
    // collected so far every time a new element opens, so it is created here.
    text = [[NSMutableString alloc] init];
}

/// Appends each run of text to the buffer.
- (void)parser:(HTMLParser *)parser
    foundCharacters:(NSString *)string {
    // -appendString: raises on nil, so skip empty or missing text.
    if (string.length > 0) {
        [text appendString:string];
    }
}

/// Logs the accumulated text once the document has ended.
- (void)parserDidEndDocument:(HTMLParser *)parser {
    NSLog(@"Text: %@", text);
}
Entity Conversion
Custom resolver:
/// Delegate that supplies replacement data for selected external entities.
@interface EntityResolver : NSObject <HTMLParserDelegate>
@end

@implementation EntityResolver

/// Returns replacement data for the entity this resolver handles.
/// @param parser The parser sending the message.
/// @param name Name of the external entity to resolve.
/// @return The replacement data, or nil for any other entity.
- (NSData *)parser:(HTMLParser *)parser
    resolveExternalEntity:(NSString *)name {
    // Compare string contents with -isEqualToString:. The == operator only
    // compares object pointers, so equal strings may not match.
    if ([name isEqualToString:...]) {
        return replacementData;
    }
    return nil;
}

@end
Writing HTML
XMLElement:
// Create an element named "div", set its string value, and read back its XMLString.
XMLElement *element = [[XMLElement alloc] initWithName:@"div"];
[element setStringValue:@"Hello"];
NSString *html = [element XMLString];
XMLNode:
// Create a node named "span", set its string value, and add it as a child of
// the `element` created in the XMLElement example.
XMLNode *node = [[XMLNode alloc] initWithName:@"span"];
[node setStringValue:@"world"];
[element addChild:node];
Advanced Usage
Incremental parsing:
// Pass each chunk to the parser as it becomes available, then call
// finishParsing once the loop ends.
while (hasMoreData) {
    [parser parseChunk:getNextDataChunk()];
}
[parser finishParsing];
Custom input stream:
// Create the parser from a caller-supplied input stream instead of an NSData object.
NSInputStream *stream = ...; // custom stream
HTMLParser *parser = [[HTMLParser alloc] initWithStream:stream];
// read and parse chunks
// NOTE(review): presumably each chunk goes to parseChunk: and is followed by
// finishParsing, as in the incremental parsing example — confirm.
Capture dynamic HTML:
// with UIWebView
// -stringByEvaluatingJavaScriptFromString: returns an NSString, so it is
// encoded as UTF-8 NSData before being passed to -initWithData:.
NSString *html = [webView stringByEvaluatingJavaScriptFromString:@"document.body.innerHTML"];
NSData *htmlData = [html dataUsingEncoding:NSUTF8StringEncoding];
HTMLParser *parser = [[HTMLParser alloc] initWithData:htmlData];
Debugging
Parser errors:
// Log the localized description of every error the parser recorded.
for (NSError *parseError in [parser errors]) {
    NSLog(@"%@", parseError.localizedDescription);
}
Enable info messages:
[HTMLParser setInfoMessageLoggingEnabled:YES];
Handling Common Errors
HTML parsing can result in errors due to malformed content or encoding issues. Always check the errors array:
// Fetch the errors recorded by the parser and react if there are any.
NSArray *parseErrors = [parser errors];
if (parseErrors.count != 0) {
    // Handle errors
}
For malformed HTML, use the HTML_PARSE_NOERROR option:
parser.options.HTML_PARSE_NOERROR = YES;
To handle encoding errors, specify the encoding:
[parser setEncoding:NSUTF8StringEncoding];
More Delegate Examples
Extract all images:
/// Picks out the "src" attribute of every "img" element.
- (void)parser:(HTMLParser *)parser
    didStartElement:(NSString *)elementName
         attributes:(NSDictionary *)attributeDict {
    // Ignore every element that is not an image.
    if (![elementName isEqualToString:@"img"]) {
        return;
    }
    NSString *imageSource = attributeDict[@"src"];
    // Download image
}
Find link URLs:
/// Picks out the "href" attribute of every "a" element.
- (void)parser:(HTMLParser *)parser
    didStartElement:(NSString *)elementName
         attributes:(NSDictionary *)attributeDict {
    // Ignore every element that is not an anchor.
    if (![elementName isEqualToString:@"a"]) {
        return;
    }
    NSString *linkTarget = attributeDict[@"href"];
    // Store link url
}
Performance Optimization
Reuse parser instances:
// Create one parser up front and pass it each document in turn.
HTMLParser *parser = [[HTMLParser alloc] init];
for (NSData *document in htmlDataArray) {
    [parser parse:document];
}
Incremental parsing:
// Pass each chunk to the parser as it becomes available, then call
// finishParsing once the loop ends.
while (hasDataAvailable) {
    [parser parseChunk:[self getNextDataChunk]];
}
[parser finishParsing];
Advanced Usage
Parse HTML from a UIWebView:
// UIWebView's JavaScript evaluation method is
// -stringByEvaluatingJavaScriptFromString:. It returns an NSString, which is
// encoded as UTF-8 NSData for the parser.
NSString *html = [webView stringByEvaluatingJavaScriptFromString:@"document.body.innerHTML"];
NSData *htmlData = [html dataUsingEncoding:NSUTF8StringEncoding];
HTMLParser *parser = [[HTMLParser alloc] initWithData:htmlData];
Use a custom input stream:
NSInputStream *stream = [NSInputStream inputStreamWithURL:url];
HTMLParser *parser = [[HTMLParser alloc] initWithStream:stream];

// An NSInputStream must be opened before it can be read.
[stream open];

// NSInputStream reads into a caller-supplied byte buffer via -read:maxLength:.
// It returns the number of bytes read, 0 at end of stream, or a negative value
// on error; the loop stops in both of the latter cases.
uint8_t buffer[4096];
NSInteger bytesRead;
while ((bytesRead = [stream read:buffer maxLength:sizeof(buffer)]) > 0) {
    NSData *chunk = [NSData dataWithBytes:buffer length:(NSUInteger)bytesRead];
    [parser parseChunk:chunk];
}
[stream close];

// Finish the parse, as in the other chunked parsing examples.
[parser finishParsing];
Generating HTML
Build a complex HTML structure:
// Create every node first.
XMLElement *html = [[XMLElement alloc] initWithName:@"html"];
XMLElement *head = [[XMLElement alloc] initWithName:@"head"];
XMLElement *body = [[XMLElement alloc] initWithName:@"body"];
XMLNode *h1 = [[XMLNode alloc] initWithName:@"h1"];
[h1 setStringValue:@"Hello World!"];

// Then assemble the tree: html gets head and body, body gets h1.
[html addChild:head];
[html addChild:body];
[body addChild:h1];

// Read the XMLString of the root element.
NSString *htmlString = [html XMLString];
Troubleshooting
Enable parser info messages:
[HTMLParser setInfoMessageLoggingEnabled:YES];
Log validation errors:
// Log every validation error reported by the parser.
for (NSError *validationError in [parser validationErrors]) {
    NSLog(@"%@", validationError);
}
Thread Safety
Perform parsing on a background thread:
// Create and run the parser on a default-priority global queue instead of the
// calling thread.
dispatch_queue_t backgroundQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
dispatch_async(backgroundQueue, ^{
    HTMLParser *backgroundParser = [[HTMLParser alloc] initWithData:data];
    [backgroundParser parse];
});