/* hgLoadSeq - load sequences into the seq/extFile tables. */
#include "common.h"
#include "options.h"
#include "portable.h"
#include "linefile.h"
#include "hash.h"
#include "fa.h"
#include "hgRelate.h"

/* command line option specifications */
static struct optionSpec optionSpecs[] = {
    {"abbr", OPTION_STRING},
    {"prefix", OPTION_STRING},
    {"replace", OPTION_BOOLEAN},
    {"drop", OPTION_BOOLEAN},
    {"test", OPTION_BOOLEAN},
    {"seqTbl", OPTION_STRING},
    {"extFileTbl", OPTION_STRING},
    {NULL, 0}
};

/* Command line options and defaults. */
char *seqTbl = "seq";
char *extFileTbl = "extFile";
char *abbr = NULL;
char *prefix = NULL;
boolean test = FALSE;
boolean replace = FALSE;
boolean drop = FALSE;

char seqTableCreate[] =
/* This keeps track of a sequence.  The %s is filled in with the table name. */
"create table %s ("
    "id int unsigned not null primary key," /* Unique ID across all tables. */
    "acc varchar(128) not null ,"           /* seq ID. */
    "size int unsigned not null,"           /* Size of sequence in bases. */
    "gb_date date not null,"                /* GenBank last modified date,
                                             * not used, for compatibility
                                             * with older databases. */
    "extFile int unsigned not null,"        /* File it is in. */
    "file_offset bigint not null,"          /* Offset in file. */
    "file_size int unsigned not null,"      /* Size in file. */
               /* Extra indices. */
    "unique (acc))";

boolean faSeekNextRecord(struct lineFile *faLf)
/* Seeks to the next FA record: reads lines until one starting with '>'
 * has been read.  Returns TRUE with that header line as the most recently
 * read line, or FALSE if EOF is reached first. */
{
char *faLine;
int faLineSize;

while (lineFileNext(faLf, &faLine, &faLineSize))
    {
    if (faLine[0] == '>')
        return TRUE;
    }
return FALSE;
}

void abbreviate(char *s, char *fluff)
/* Cut out the first occurrence of fluff from s, editing s in place.
 * Does nothing if either argument is NULL or fluff does not occur in s. */
{
if (s == NULL || fluff == NULL)
    return;

char *hit = strstr(s, fluff);
if (hit != NULL)
    {
    char *tail = hit + strlen(fluff);
    /* Source and destination overlap, so strcpy() would be undefined
     * behavior here; memmove() is specified for overlapping regions.
     * The +1 carries the terminating NUL along. */
    memmove(hit, tail, strlen(tail) + 1);
    }
}

boolean readFaSeq(struct lineFile *faLf, char **retFaName, int *retDnaSize,
                  off_t *retFaOffset)
/* Read the next record, returning its start location in the file.
 *   retFaName   - set by faMixedSpeedReadNext to the record name.
 *   retDnaSize  - set by faMixedSpeedReadNext to the sequence size.
 *   retFaOffset - set to the file offset of the record's '>' line.
 * Returns FALSE at end of file.  Aborts if the next line is not a '>'
 * header line. */
{
// to get offset, must read first line, save offset, then read the record to
// get the size
char *faLine;
if (!lineFileNext(faLf, &faLine, NULL))
    return FALSE;
if (faLine[0] != '>')
    errAbort("fasta record doesn't start with '>' line %d of %s",
             faLf->lineIx, faLf->fileName);
*retFaOffset = faLf->bufOffsetInFile + faLf->lineStart;

/* Push the header line back so the fasta reader sees the whole record. */
lineFileReuse(faLf);
DNA *dna;
boolean gotIt = faMixedSpeedReadNext(faLf, &dna, retDnaSize, retFaName);
if (!gotIt)
    internalErr();   /* a header line was just seen, so a record must exist */
return TRUE;
}

boolean loadFaSeq(struct lineFile *faLf, HGID extFileId, HGID seqId,
                  FILE *seqTab, struct sqlConnection* conn)
/* Add next sequence in fasta file to tab file.
 *   faLf      - open fasta file, positioned at a record header.
 *   extFileId - id written to the extFile column.
 *   seqId     - id written to the id column.
 *   seqTab    - tab file that receives one row for this record.
 *   conn      - not used by this function.
 * The accession is the first word of the fasta name, with the global abbr
 * string cut out and, if the global prefix is set, "prefix-" prepended.
 * Returns FALSE when there are no more records.  Aborts if the accession
 * is empty or does not fit in the accession buffer. */
{
off_t faOffset, faEndOffset;
int faSize, dnaSize;
char *faName, faAcc[256], faAccBuf[513];
int prefixLen = 0;

/* Get next FA record. */
if (!readFaSeq(faLf, &faName, &dnaSize, &faOffset))
    return FALSE;

char *s = firstWordInLine(faName);
abbreviate(s, abbr);
if (s == NULL || s[0] == 0)
    errAbort("Missing accession line %d of %s", faLf->lineIx, faLf->fileName);

if (prefix != NULL)
    prefixLen = strlen(prefix) + 1;   /* +1 for the '-' separator */

/* Measure the string that is actually copied (s), not faName+1, so the
 * check cannot under-count and let the copy run past the end of faAcc. */
if (strlen(s) + prefixLen >= sizeof(faAcc))
    errAbort("Fasta name too long line %d of %s", faLf->lineIx, faLf->fileName);

faAcc[0] = 0;
if (prefix != NULL)
    {
    safecat(faAcc, sizeof(faAcc), prefix);
    safecat(faAcc, sizeof(faAcc), "-");
    }
safecat(faAcc, sizeof(faAcc), s);

/* NOTE(review): assumes the fasta reader leaves the line file positioned at
 * the start of the next record (or EOF), so this is the end of the current
 * record - confirm against faMixedSpeedReadNext. */
faEndOffset = faLf->bufOffsetInFile + faLf->lineStart;
faSize = (int)(faEndOffset - faOffset);

/* note: sqlDate column is empty */
fprintf(seqTab, "%u\t%s\t%d\t0000-00-00\t%u\t%lld\t%d\n",
        seqId, sqlEscapeTabFileString2(faAccBuf, faAcc), dnaSize, extFileId,
        (long long)faOffset, faSize);
return TRUE;
}

void loadFa(char *faFile, struct sqlConnection *conn, FILE *seqTab,
            HGID *nextSeqId)
/* Add sequences in a fasta file to a seq table tab file.
 *   faFile    - path of the fasta file to read.
 *   conn      - database connection used to register faFile in the extFile
 *               table (skipped when the global test flag is set, in which
 *               case the extFile id is 0).
 *   seqTab    - tab file that receives one row per sequence.
 *   nextSeqId - in/out: id for the next sequence; incremented once for each
 *               sequence written.
 * Aborts if the file contains no line starting with '>'. */
{
HGID extFileId = test ? 0 : hgAddToExtFileTbl(faFile, conn, extFileTbl);
struct lineFile *faLf = lineFileOpen(faFile, TRUE);
unsigned count = 0;

verbose(1, "Adding %s\n", faFile);

/* Seek to first line starting with '>' in line file. */
if (!faSeekNextRecord(faLf))
    errAbort("%s doesn't appear to be an .fa file\n", faLf->fileName);
lineFileReuse(faLf);

/* Loop around for each record of FA */
while (loadFaSeq(faLf, extFileId, *nextSeqId, seqTab, conn))
    {
    (*nextSeqId)++;
    count++;
    }

verbose(1, "%u sequences\n", count);
lineFileClose(&faLf);
}

void hgLoadSeq(char *database, int fileCount, char *fileNames[])
/* Add a bunch of FA files to sequence and extFile tables of
 * database.
*/ { struct sqlConnection *conn; int i; FILE *seqTab; HGID firstSeqId = 0, nextSeqId = 0; if (!test) { conn = hgStartUpdate(database); char query[1024]; if (drop) { safef(query, sizeof(query), "drop table if exists %s", seqTbl); sqlUpdate(conn, query); safef(query, sizeof(query), "drop table if exists %s", extFileTbl); sqlUpdate(conn, query); } safef(query, sizeof(query), seqTableCreate, seqTbl); sqlMaybeMakeTable(conn, seqTbl, query); firstSeqId = nextSeqId = hgGetMaxId(conn, seqTbl) + 1; } verbose(1, "Creating %s.tab file\n", seqTbl); seqTab = hgCreateTabFile(".", seqTbl); for (i=0; i