/* gfClient - A client for the genomic finding program that produces a .psl file. */ /* Copyright 2001-2003 Jim Kent. All rights reserved. */ #include "common.h" #include "linefile.h" #include "aliType.h" #include "fa.h" #include "genoFind.h" #include "psl.h" #include "options.h" #include "fuzzyFind.h" static struct optionSpec optionSpecs[] = { {"prot", OPTION_BOOLEAN}, {"q", OPTION_STRING}, {"t", OPTION_STRING}, {"minIdentity", OPTION_FLOAT}, {"minScore", OPTION_INT}, {"dots", OPTION_INT}, {"out", OPTION_STRING}, {"maxIntron", OPTION_INT}, {"nohead", OPTION_BOOLEAN}, {NULL, 0} }; /* Variables that can be overridden by command line. */ int dots = 0; int minScore = 30; double minIdentity = 90; char *outputFormat = "psl"; char *qType = "dna"; char *tType = "dna"; void usage() /* Explain usage and exit. */ { printf( "gfClient v. %s - A client for the genomic finding program that produces a .psl file\n" "usage:\n" " gfClient host port seqDir in.fa out.psl\n" "where\n" " host is the name of the machine running the gfServer\n" " port is the same as you started the gfServer with\n" " seqDir is the path of the .nib or .2bit files relative to the current dir\n" " (note these are needed by the client as well as the server)\n" " in.fa is a fasta format file. May contain multiple records\n" " out.psl where to put the output\n" "options:\n" " -t=type Database type. Type is one of:\n" " dna - DNA sequence\n" " prot - protein sequence\n" " dnax - DNA sequence translated in six frames to protein\n" " The default is dna\n" " -q=type Query type. Type is one of:\n" " dna - DNA sequence\n" " rna - RNA sequence\n" " prot - protein sequence\n" " dnax - DNA sequence translated in six frames to protein\n" " rnax - DNA sequence translated in three frames to protein\n" " -prot Synonymous with -d=prot -q=prot\n" " -dots=N Output a dot every N query sequences\n" " -nohead Suppresses psl five line header\n" " -minScore=N sets minimum score. This is twice the matches minus the \n" " mismatches minus some sort of gap penalty. Default is 30\n" " -minIdentity=N Sets minimum sequence identity (in percent). Default is\n" " 90 for nucleotide searches, 25 for protein or translated\n" " protein searches.\n" " -out=type Controls output file format. Type is one of:\n" " psl - Default. Tab separated format without actual sequence\n" " pslx - Tab separated format with sequence\n" " axt - blastz-associated axt format\n" " maf - multiz-associated maf format\n" " sim4 - similar to sim4 format\n" " wublast - similar to wublast format\n" " blast - similar to NCBI blast format\n" " blast8- NCBI blast tabular format\n" " blast9 - NCBI blast tabular format with comments\n" " -maxIntron=N Sets maximum intron size. Default is %d\n", gfVersion, ffIntronMaxDefault); exit(-1); } struct gfOutput *gvo; void gfClient(char *hostName, char *portName, char *tSeqDir, char *inName, char *outName, char *tTypeName, char *qTypeName) /* gfClient - A client for the genomic finding program that produces a .psl file. */ { struct lineFile *lf = lineFileOpen(inName, TRUE); static bioSeq seq; FILE *out = mustOpen(outName, "w"); enum gfType qType = gfTypeFromName(qTypeName); enum gfType tType = gfTypeFromName(tTypeName); int dotMod = 0; char databaseName[256]; struct hash *tFileCache = gfFileCacheNew(); snprintf(databaseName, sizeof(databaseName), "%s:%s", hostName, portName); gvo = gfOutputAny(outputFormat, round(minIdentity*10), qType == gftProt, tType == gftProt, optionExists("nohead"), databaseName, 23, 3.0e9, minIdentity, out); gfOutputHead(gvo, out); while (faSomeSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name, qType != gftProt)) { int conn = gfConnect(hostName, portName); if (dots != 0) { if (++dotMod >= dots) { dotMod = 0; fputc('.', stdout); fflush(stdout); } } if (qType == gftProt && (tType == gftDnaX || tType == gftRnaX)) { gvo->reportTargetStrand = TRUE; gfAlignTrans(&conn, tSeqDir, &seq, minScore, tFileCache, gvo); } else if ((qType == gftRnaX || qType == gftDnaX) && (tType == gftDnaX || tType == gftRnaX)) { gvo->reportTargetStrand = TRUE; gfAlignTransTrans(&conn, tSeqDir, &seq, FALSE, minScore, tFileCache, gvo, qType == gftRnaX); if (qType == gftDnaX) { reverseComplement(seq.dna, seq.size); close(conn); conn = gfConnect(hostName, portName); gfAlignTransTrans(&conn, tSeqDir, &seq, TRUE, minScore, tFileCache, gvo, FALSE); } } else if ((tType == gftDna || tType == gftRna) && (qType == gftDna || qType == gftRna)) { gfAlignStrand(&conn, tSeqDir, &seq, FALSE, minScore, tFileCache, gvo); conn = gfConnect(hostName, portName); reverseComplement(seq.dna, seq.size); gfAlignStrand(&conn, tSeqDir, &seq, TRUE, minScore, tFileCache, gvo); } else { errAbort("Comparisons between %s queries and %s databases not yet supported", qTypeName, tTypeName); } gfOutputQuery(gvo, out); } if (out != stdout) printf("Output is in %s\n", outName); gfFileCacheFree(&tFileCache); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc != 6) usage(); if (optionExists("prot")) qType = tType = "prot"; qType = optionVal("q", qType); tType = optionVal("t", tType); if (sameWord(tType, "prot") || sameWord(tType, "dnax") || sameWord(tType, "rnax")) minIdentity = 25; minIdentity = optionFloat("minIdentity", minIdentity); minScore = optionInt("minScore", minScore); dots = optionInt("dots", 0); outputFormat = optionVal("out", outputFormat); /* set global for fuzzy find functions */ setFfIntronMax(optionInt("maxIntron", ffIntronMaxDefault)); gfClient(argv[1], argv[2], argv[3], argv[4], argv[5], tType, qType); return 0; }