/* autoDtd - Give this a XML document to look at and it will come up with a
 * DTD to describe it, and possibly some more readable and informative outputs
 * as well. */

/* This file is copyright 2005 Jim Kent, but license is hereby
 * granted for all use - public, private or commercial. */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "xap.h"

void usage()
/* Explain usage and exit. */
{
errAbort(
  "autoDtd - Give this a XML document to look at and it will come up with a DTD\n"
  "to describe it.\n"
  "usage:\n"
  " autoDtd in.xml out.dtd out.stats\n"
  "options:\n"
  " -tree=out.tree - Output tag tree.\n"
  " -atree=out.atree - Output attributed tag tree.\n"
  );
}

static struct optionSpec options[] = {
   {"tree", OPTION_STRING},
   {"atree", OPTION_STRING},
   {NULL, 0},
};

struct type
/* Information on a type (one per distinct tag name). */
    {
    struct type *next;
    char *name;			/* Name of type/field. */
    int count;			/* Number of occurrences of this tag. */
    struct hash *attHash;	/* Hash of all attributes keyed by name */
    struct attribute *attributes;	/* Attributes in order first seen. */
    struct hash *elHash;	/* Hash of all elements keyed by type->name */
    struct element *elements;	/* Child elements in order first seen. */
    struct attribute *textAttribute;	/* Information on text. */
    };

struct attribute
/* Information on an attribute */
    {
    struct attribute *next;
    char *name;
    int count;			/* Number of times we've seen this attribute. */
    boolean isOptional;		/* True if it's not always there. */
    boolean nonInt;		/* True if not an int. */
    boolean nonFloat;		/* True if not a number. */
    boolean seenThisRound;	/* True if seen this round. */
    struct hash *values;	/* Hash of unique values. */
    int maxLen;			/* Maximum length */
    };

struct element
/* Information on an element */
    {
    struct element *next;
    struct type *type;		/* Element type */
    boolean isOptional;		/* True if it's optional. */
    boolean isList;		/* True if it's a list. */
    boolean seenThisRound;	/* True if seen this round. */
    };

struct hash *typeHash;	/* Keyed by tag name, values are struct type */
struct type *topType;	/* Highest level type */

boolean hasLeftPaddedZero(char *s)
/* Return TRUE if s is at least two characters long and starts with '0'. */
{
return s[0] == '0' && s[1] != 0;
}

boolean isAllUInt(char *s)
/* Return TRUE if every character of s is a decimal digit.  Note that the
 * empty string passes; callers that need at least one digit check for that
 * themselves. */
{
char c;
while ((c = *s++) != 0)
    {
    /* Cast is required: passing a negative char (any non-ASCII byte on
     * platforms where char is signed) to isdigit is undefined. */
    if (!isdigit((unsigned char)c))
        return FALSE;
    }
return TRUE;
}

boolean isAllInt(char *s)
/* Return TRUE if s looks like an integer: an optional leading '-', then one
 * or more digits with no zero padding.  A bare "-" or an empty string is
 * not an integer. */
{
if (*s == '-')
    ++s;
if (*s == 0)
    return FALSE;
return isAllUInt(s) && !hasLeftPaddedZero(s);
}

boolean isAllFloat(char *s)
/* Return TRUE if s looks like a floating point number: either an integer,
 * or an optionally negative digit string, a '.', and another digit string.
 * Either side of the point may be empty ("1." and ".5" are accepted) but
 * not both, so "." and "-." are rejected. */
{
char *point = strchr(s, '.');
if (point == NULL)
    return isAllInt(s);

char *fraction = point + 1;
if (!isAllUInt(fraction))
    return FALSE;

char *whole = cloneStringZ(s, point - s);
char *digits = (whole[0] == '-') ? whole + 1 : whole;
boolean result = isAllUInt(digits) && !hasLeftPaddedZero(digits);
if (digits[0] == 0 && fraction[0] == 0)
    result = FALSE;		/* No digits at all. */
freeMem(whole);
return result;
}

static void noteValue(struct attribute *att, char *val)
/* Fold one observed value into the running statistics kept in att:
 * occurrence count, set of unique values, maximum length, and whether
 * every value so far could be an int or a float. */
{
int valLen = strlen(val);
att->count += 1;
hashStore(att->values, val);
if (valLen > att->maxLen)
    att->maxLen = valLen;
if (!att->nonInt && !isAllInt(val))
    att->nonInt = TRUE;
if (!att->nonFloat && !isAllFloat(val))
    att->nonFloat = TRUE;
}

static char *occurrenceSuffix(boolean isOptional, boolean isList)
/* Return the DTD-style occurrence marker: "*" for an optional list, "+"
 * for a required list, "?" for an optional single, "" for a required
 * single. */
{
if (isList)
    return isOptional ? "*" : "+";
return isOptional ? "?" : "";
}

void *startHandler(struct xap *xap, char *name, char **atts)
/* Called at the start of a tag after attributes are parsed.  Finds or
 * creates the type for this tag name, records its attributes, and notes
 * the tag as a child element of its parent.  atts is read as name/value
 * pairs terminated by a NULL name.  Returns the type; endHandler later
 * reads it back from xap->stack->object. */
{
int i;
struct type *type = hashFindVal(typeHash, name);
struct attribute *att;
struct element *el;

if (type == NULL)
    {
    AllocVar(type);
    hashAddSaveName(typeHash, name, type, &type->name);
    type->elHash = hashNew(6);
    type->attHash = hashNew(6);
    }

/* Zero out seenThisRound flags */
for (el = type->elements; el != NULL; el = el->next)
    el->seenThisRound = FALSE;
for (att = type->attributes; att != NULL; att = att->next)
    att->seenThisRound = FALSE;

for (i = 0; atts[i] != NULL; i += 2)
    {
    char *attName = atts[i], *val = atts[i+1];
    att = hashFindVal(type->attHash, attName);
    if (att == NULL)
        {
	AllocVar(att);
	hashAddSaveName(type->attHash, attName, att, &att->name);
	att->values = hashNew(16);
	slAddTail(&type->attributes, att);
	/* First seen on a later instance of the tag, so earlier
	 * instances lacked it. */
	if (type->count != 0)
	    att->isOptional = TRUE;
	}
    noteValue(att, val);
    att->seenThisRound = TRUE;
    }

/* Any known attribute missing from this instance is optional. */
for (att = type->attributes; att != NULL; att = att->next)
    {
    if (!att->seenThisRound)
        att->isOptional = TRUE;
    }

if (xap->stackDepth > 1)
    {
    /* The stack entry after the current one is taken to be the parent. */
    struct xapStack *st = xap->stack+1;
    struct type *parent = st->object;
    el = hashFindVal(parent->elHash, name);
    if (el == NULL)
        {
	AllocVar(el);
	hashAdd(parent->elHash, name, el);
	el->type = type;
	slAddTail(&parent->elements, el);
	if (parent->count != 0)
	    el->isOptional = TRUE;
	}
    /* Second occurrence inside the same parent instance makes it a list. */
    if (el->seenThisRound)
        el->isList = TRUE;
    el->seenThisRound = TRUE;
    }
return type;
}

void endHandler(struct xap *xap, char *name)
/* Called at end of a tag.  Marks child elements that did not appear in
 * this instance as optional, records the text between the tags (after
 * skipping leading spaces) as a nameless text attribute, bumps the type's
 * count and remembers the type as topType.  The name parameter is unused. */
{
struct type *type = xap->stack->object;
char *text = skipLeadingSpaces(xap->stack->text->string);
struct element *el;

for (el = type->elements; el != NULL; el = el->next)
    {
    if (!el->seenThisRound)
        el->isOptional = TRUE;
    }

if (text[0] == 0)
    {
    if (type->textAttribute != NULL)
        type->textAttribute->isOptional = TRUE;
    }
else
    {
    struct attribute *att = type->textAttribute;
    if (att == NULL)
        {
	type->textAttribute = AllocVar(att);
	att->name = "";
	att->values = hashNew(16);
	if (type->count != 0)
	    att->isOptional = TRUE;
	}
    noteValue(att, text);
    }
type->count += 1;
/* The last tag to close is the outermost one, so after parsing this holds
 * the root type. */
topType = type;
}

char *attDataType(struct attribute *att)
/* Return data type associated with attribute as a string */
{
if (!att->nonInt)
    return "int";
if (!att->nonFloat)
    return "float";
return "string";
}

static void writeAttStats(FILE *statsFile, struct attribute *att)
/* Write one tab-indented statistics line for an attribute: name, maximum
 * length, data type, count and number of unique values. */
{
fprintf(statsFile, "\t%s\t%d\t%s\t%d\t%d\n", att->name, att->maxLen,
	attDataType(att), att->count, att->values->elCount);
}

void rWriteDtd(FILE *dtdFile, FILE *statsFile, struct type *type,
	struct hash *uniqHash)
/* Recursively write out DTD.  Writes the ELEMENT and ATTLIST declarations
 * for type to dtdFile and its statistics to statsFile, then recurses into
 * child types not yet present in uniqHash. */
{
struct element *el;
struct attribute *att;
struct attribute *textAtt = type->textAttribute;
int elCount = slCount(type->elements);
boolean multiline;

if (textAtt != NULL)
    elCount += 1;
multiline = (elCount > 3);	/* Long content models get one item per line. */

/* Register before recursing so a type that contains itself is written
 * only once. */
hashAdd(uniqHash, type->name, type);

fprintf(dtdFile, "<!ELEMENT %s (", type->name);
if (multiline)
    fprintf(dtdFile, "\n");
for (el = type->elements; el != NULL; el = el->next)
    {
    if (multiline)
	fprintf(dtdFile, "\t");
    fprintf(dtdFile, "%s", el->type->name);
    fprintf(dtdFile, "%s", occurrenceSuffix(el->isOptional, el->isList));
    if (el->next != NULL || textAtt != NULL)
	fprintf(dtdFile, ", ");
    if (multiline)
	fprintf(dtdFile, "\n");
    }
if (textAtt != NULL)
    {
    if (multiline)
	fprintf(dtdFile, "\t");
    if (!textAtt->nonInt)
	fprintf(dtdFile, "%%INTEGER;");
    else if (!textAtt->nonFloat)
	fprintf(dtdFile, "%%REAL;");
    else
	fprintf(dtdFile, "#PCDATA");
    if (multiline)
	fprintf(dtdFile, "\n");
    }
fprintf(dtdFile, ")>\n");

fprintf(statsFile, "%s %d\n", type->name, type->count);
if (textAtt != NULL)
    writeAttStats(statsFile, textAtt);
else
    fprintf(statsFile, "\t\t0\tnone\t0\t0\n");

for (att = type->attributes; att != NULL; att = att->next)
    {
    fprintf(dtdFile, "<!ATTLIST %s %s ", type->name, att->name);
    if (!att->nonInt)
	fprintf(dtdFile, "%%int;");
    else if (!att->nonFloat)
	fprintf(dtdFile, "%%float;");
    else
	fprintf(dtdFile, "CDATA");
    if (att->isOptional)
	fprintf(dtdFile, " #IMPLIED");
    else
	fprintf(dtdFile, " #REQUIRED");
    fprintf(dtdFile, ">\n");
    writeAttStats(statsFile, att);
    }
fprintf(dtdFile, "\n");
fprintf(statsFile, "\n");

/* Now recurse if we haven't written children yet. */
for (el = type->elements; el != NULL; el = el->next)
    {
    if (!hashLookup(uniqHash, el->type->name))
	rWriteDtd(dtdFile, statsFile, el->type, uniqHash);
    }
}

void writeDtd(char *dtdFileName, char *statsFileName, char *xmlFileName,
	struct type *type)
/* Write out DTD.  Creates dtdFileName and statsFileName, writes a header to
 * each (including the entity declarations that the %INTEGER;, %REAL;,
 * %int; and %float; references made by rWriteDtd rely on), then writes
 * type and everything beneath it. */
{
struct hash *uniqHash = newHash(0);  /* Prevent writing dup defs for shared types. */
FILE *dtdFile = mustOpen(dtdFileName, "w");
FILE *statsFile = mustOpen(statsFileName, "w");

fprintf(dtdFile, "<!-- This file was created by autoDtd based on %s -->\n\n", xmlFileName);
fprintf(dtdFile, "<!-- First some entities to mark numeric types in between tags. -->\n");
fprintf(dtdFile, "<!ENTITY %% INTEGER \"#PCDATA\">\n");
fprintf(dtdFile, "<!ENTITY %% REAL \"#PCDATA\">\n\n");
fprintf(dtdFile, "<!-- Now some entities for numeric attributes. -->\n");
fprintf(dtdFile, "<!ENTITY %% int \"CDATA\">\n");
fprintf(dtdFile, "<!ENTITY %% float \"CDATA\">\n\n");
fprintf(dtdFile, "<!-- Now the actual elements and attributes. -->\n");

fprintf(statsFile, "#Statistics on %s\n", xmlFileName);
fprintf(statsFile, "#Format is:\n");
fprintf(statsFile, "# <tag name> <tag count>\n");
fprintf(statsFile, "# <> <max length> <type> <count> <unique count>\n");
fprintf(statsFile, "# <attribute name> <max length> <type> <count> <unique count>\n");
fprintf(statsFile, "\n");

rWriteDtd(dtdFile, statsFile, type, uniqHash);
carefulClose(&dtdFile);
carefulClose(&statsFile);
}

void writeAttValType(FILE *f, struct attribute *att)
/* Write out #, % or $ depending if type is int, float, or string */
{
if (!att->nonInt)
    fprintf(f, "#");
else if (!att->nonFloat)
    fprintf(f, "%%");
else
    fprintf(f, "$");
}

void writeAttribute(FILE *f, struct attribute *att)
/* Write out information about attribute: a space, the type marker, the
 * name, and a trailing '?' if the attribute is optional. */
{
fprintf(f, " ");
writeAttValType(f, att);
fprintf(f, "%s", att->name);
if (att->isOptional)
    fprintf(f, "?");
}

void rWriteTree(FILE *f, struct type *type, boolean isOptional, boolean isList,
	struct hash *uniqHash, boolean withAttributes, int level)
/* Write out type and its children, one per line, indented two spaces per
 * level.  With withAttributes set, the name is preceded by the text type
 * marker (if the type has text) and followed by its attributes.  uniqHash
 * is accepted but not consulted here. */
{
struct attribute *att;
struct element *el;
static struct type *parentStack[256];	/* Ancestors of the current type. */
int i;

spaceOut(f, level*2);
if (withAttributes && type->textAttribute != NULL)
    writeAttValType(f, type->textAttribute);
fprintf(f, "%s", type->name);
fprintf(f, "%s", occurrenceSuffix(isOptional, isList));
if (withAttributes)
    {
    for (att = type->attributes; att != NULL; att = att->next)
	writeAttribute(f, att);
    }
fprintf(f, "\n");

if (level >= ArraySize(parentStack))
    errAbort("Recursion too deep in rWriteTree");
parentStack[level] = type;
for (i = level-1; i >= 0; i -= 1)
    {
    if (type == parentStack[i])
	return;	/* Avoid cycling on self. */
    }
for (el = type->elements; el != NULL; el = el->next)
    rWriteTree(f, el->type, el->isOptional, el->isList, uniqHash,
	    withAttributes, level+1);
}

void writeTree(char *fileName, struct type *root, boolean withAttributes)
/* Write out type tree to file. */
{
struct hash *uniqHash = newHash(0);  /* Prevent writing dup defs. */
FILE *f = mustOpen(fileName, "w");
rWriteTree(f, root, FALSE, FALSE, uniqHash, withAttributes, 0);
carefulClose(&f);
}

void autoDtd(char *inXml, char *outDtd, char *outStats, char *treeFileName,
	char *atreeFileName)
/* autoDtd - Give this a XML document to look at and it will come up with a
 * DTD to describe it.  treeFileName and atreeFileName may each be NULL, in
 * which case that output is skipped. */
{
struct xap *xap = xapNew(startHandler, endHandler, inXml);
typeHash = newHash(0);
xapParseFile(xap, inXml);
/* topType is only set when a tag closes; without this check an input with
 * no tags would be dereferenced as NULL in writeDtd. */
if (topType == NULL)
    errAbort("No tags found in %s", inXml);
writeDtd(outDtd, outStats, inXml, topType);
if (treeFileName != NULL)
    writeTree(treeFileName, topType, FALSE);
if (atreeFileName != NULL)
    writeTree(atreeFileName, topType, TRUE);
}

int main(int argc, char *argv[])
/* Process command line. */
{
char *treeFileName = NULL, *atreeFileName = NULL;
optionInit(&argc, argv, options);
if (argc != 4)
    usage();
treeFileName = optionVal("tree", treeFileName);
atreeFileName = optionVal("atree", atreeFileName);
autoDtd(argv[1], argv[2], argv[3], treeFileName, atreeFileName);
return 0;
}