/************************************************************************** * FILE: html-data.c * AUTHOR: James Johnson * CREATE DATE: 01 June 2011 * PROJECT: shared * COPYRIGHT: UQ, 2009 * VERSION: $Revision: 1.0 $ * DESCRIPTION: A callback based push parser for html documents designed * to extract the tags and get their name * and value attributes. Automatically skips the contents of comments and * script or style tags. Keeps a count of all valid html tags that it sees. **************************************************************************/ #include #include #include #include "html-data.h" #include "linked-list.h" #include "red-black-tree.h" #include "string-builder.h" #include "string-match.h" #include "utils.h" /* * Possible parser states */ enum hdata_state { HDATA_READY, // looking for < HDATA_TAGNAME, // seen < so reading following to determine if it's a tag or comment or just stuff HDATA_INTAG, // seen valid html tag name, now reading attributes until end of tag HDATA_ATTRNAME, // reading attribute name HDATA_ATTRVALUE, // parse the value of a generic attribute HDATA_SINGLEQUOTE, // parse an attribute ending in a single quote HDATA_DOUBLEQUOTE, // parse an attribute ending in a double quote HDATA_NOQUOTE, // parse an unquoted attribute value HDATA_COMMENT, // after a HDATA_SKIP // skiping content until a specific close tag }; typedef enum hdata_state HDATA_STATE_EN; /* * Possible ways of using the current attribute value */ enum hdata_attr { HDATA_IGNORE, // the current attribute can be safely ignored HDATA_INPUT_TYPE, // the current attribute is the type attribute of an input tag HDATA_INPUT_NAME, // the current attribute is the name attribute of an input tag HDATA_INPUT_VALUE // the current attribute is the value attribute of an input tag }; typedef enum hdata_attr HDATA_ATTR_EN; /* * Keeps track of tag sightings */ struct tag { char *name; // name of the tag long opened; // number of times an open tag has been seen long closed; // number of times a 
close tag has been seen long self_closed; // number of times a self closed tag has been seen short last; // 0 = self closed (or unseen), 1 = opened last, -1 = closed last BOOLEAN_T is_input_tag; // true if input tag BMSTR_T *skip; // should the content be skipped (for script and style tags) }; typedef struct tag TAG_T; /* * Keeps track of parser state */ struct hdata { HDATA_FN callback; // callback function to alert the user about a hidden input void *data; // data the user is using to keep track of state size_t max_attr_len; // maximum attribute length to store HDATA_STATE_EN state; // current state of parser HDATA_ATTR_EN attr; // current attribute TAG_T *tag; // current tag long tag_count; // total count of html tags sighted BOOLEAN_T leading_slash; // does the current tag have a leading slash BOOLEAN_T trailing_slash; // does the current tag have a trailing slash BOOLEAN_T type_hidden; // does the input tag have a type="hidden" attribute char *name; // the name of the input tag long name_overflow; // num of name chars discarded char *value; // the value of the input tag long value_overflow; // num of value chars discarded STR_T *strb; // string builder long overflow; // number of chars that couldn't fit in the string builder RBTREE_T *tags; // names of valid html tags BMSTR_T *comment_end; // pattern to match the end of a comment RBTREE_T *prior_states; // used by update to ensure no accidental infinite loops }; // Assumptions needed by this parser // - Assume that a is found except // inside quoted attributes of tags and the content of script and style tags // where it will be ignored // - Assume all to end // - Assume all to end. // - Assume that a was present // - Assume the optional attributes of a tag will be defined as attr="..." or // attr='...' 
//   or attr=...[whitespace] or attr where the first 2 forms are
//   called quoted attributes
// - Assume the attr token does not contain > or /
// - Assume the whitespace terminated form of the attribute will not contain >
//   or /
// - Assume if the attribute has no attr= then the attribute's value is its name
// - Assume the ... of the quoted attributes may be anything but the terminating
//   quote.
// - Assume that a forward slash separated by zero or more whitespace from a >
//   means that a script or style tag has no content to ignore
// - Assume elements