/**************************************************************************
* FILE: html-data.c
* AUTHOR: James Johnson
* CREATE DATE: 01 June 2011
* PROJECT: shared
* COPYRIGHT: UQ, 2009
* VERSION: $Revision: 1.0 $
* DESCRIPTION: A callback based push parser for html documents designed
* to extract the tags and get their name
* and value attributes. Automatically skips the contents of comments and
* script or style tags. Keeps a count of all valid html tags that it sees.
**************************************************************************/
#include
#include
#include
#include "html-data.h"
#include "linked-list.h"
#include "red-black-tree.h"
#include "string-builder.h"
#include "string-match.h"
#include "utils.h"
/*
 * States of the html parser's state machine.
 * Enumerators are numbered in declaration order, starting from zero.
 */
typedef enum hdata_state {
  /* looking for < */
  HDATA_READY = 0,
  /* seen < so reading what follows to decide if it's a tag, a comment or just stuff */
  HDATA_TAGNAME,
  /* seen a valid html tag name, now reading attributes until the end of the tag */
  HDATA_INTAG,
  /* reading an attribute name */
  HDATA_ATTRNAME,
  /* parsing the value of a generic attribute */
  HDATA_ATTRVALUE,
  /* parsing an attribute value that ends in a single quote */
  HDATA_SINGLEQUOTE,
  /* parsing an attribute value that ends in a double quote */
  HDATA_DOUBLEQUOTE,
  /* parsing an unquoted attribute value */
  HDATA_NOQUOTE,
  /* inside a comment; presumably skipping until the end of the comment is matched
   * (the original note was truncated - confirm) */
  HDATA_COMMENT,
  /* skipping content until a specific close tag */
  HDATA_SKIP
} HDATA_STATE_EN;
/*
 * How the value of the attribute currently being read will be used.
 * Enumerators are numbered in declaration order, starting from zero.
 */
typedef enum hdata_attr {
  /* the current attribute can be safely ignored */
  HDATA_IGNORE = 0,
  /* the current attribute is the type attribute of an input tag */
  HDATA_INPUT_TYPE,
  /* the current attribute is the name attribute of an input tag */
  HDATA_INPUT_NAME,
  /* the current attribute is the value attribute of an input tag */
  HDATA_INPUT_VALUE
} HDATA_ATTR_EN;
/*
 * Per-tag record of how often, and in which form, a tag has been sighted.
 */
typedef struct tag {
  /* name of the tag */
  char *name;
  /* number of times an open tag has been seen */
  long opened;
  /* number of times a close tag has been seen */
  long closed;
  /* number of times a self closed tag has been seen */
  long self_closed;
  /* most recent sighting: 0 = self closed (or unseen), 1 = opened last, -1 = closed last */
  short last;
  /* true if this is the input tag */
  BOOLEAN_T is_input_tag;
  /* should the content be skipped (for script and style tags);
   * presumably the pattern of the close tag to skip to - confirm */
  BMSTR_T *skip;
} TAG_T;
/*
 * Keeps track of parser state.
 * Holds the user's callback and data, the current position in the state
 * machine, the details gathered for the tag currently being read (slashes,
 * hidden type, name and value), and the supporting structures (string
 * builder, tag lookup tree and match patterns).
 */
struct hdata {
// user supplied hooks
HDATA_FN callback; // callback function to alert the user about a hidden input
void *data; // data the user is using to keep track of state
size_t max_attr_len; // maximum attribute length to store
// state machine position
HDATA_STATE_EN state; // current state of parser
HDATA_ATTR_EN attr; // current attribute (how its value will be used)
TAG_T *tag; // current tag
long tag_count; // total count of html tags sighted
// details of the tag currently being read
BOOLEAN_T leading_slash; // does the current tag have a leading slash
BOOLEAN_T trailing_slash; // does the current tag have a trailing slash
BOOLEAN_T type_hidden; // does the input tag have a type="hidden" attribute
char *name; // the name of the input tag
long name_overflow; // num of name chars discarded
char *value; // the value of the input tag
long value_overflow; // num of value chars discarded
// supporting structures
STR_T *strb; // string builder
long overflow; // number of chars that couldn't fit in the string builder
RBTREE_T *tags; // names of valid html tags
BMSTR_T *comment_end; // pattern to match the end of a comment
RBTREE_T *prior_states; // used by update to ensure no accidental infinite loops
};
// Assumptions needed by this parser
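// - Assume that a tag starts when a < is found and ends when a > is found except
// inside quoted attributes of tags and the content of script and style tags
// where it will be ignored
// (NOTE(review): part of this item was lost to markup stripping; wording
// reconstructed - confirm against the original source)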
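// - Assume all <script> tags have a matching </script> to end
// - Assume all <style> tags have a matching </style> to end.
// (NOTE(review): the tag names in the two items above were lost to markup
// stripping and have been reconstructed - confirm against the original source)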
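// - Assume that a </tag> means a matching <tag> was present
// (NOTE(review): tag tokens reconstructed after markup stripping - confirm)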
// - Assume the optional attributes of a tag will be defined as attr="..." or
// attr='...' or attr=...[whitespace] or attr where the first 2 forms are
// called quoted attributes
// - Assume the attr token does not contain > or /
// - Assume the whitespace terminated form of the attribute will not contain >
// or /
// - Assume if the attribute has no attr= then the attributes value is its name
// - Assume the ... of the quoted attributes may be anything but the terminating
// quote.
// - Assume that a forward slash separated by zero or more whitespace from a >
// means that a script or style tag has no content to ignore
// - Assume elements