/* * $Id: getsize.c 4278 2009-12-23 09:58:37Z james_johnson $ * * $Log$ * */ /*********************************************************************** * * * MEME * Copyright 1994--2009, The Regents of the University of California * * Author: Timothy L. Bailey * * * ***********************************************************************/ /* getsize.c */ /* getsize [options] Reads in a sequence file (Pearson/FASTA format). Prints the number of sequences, min L, max L, mean L, total residues and letters in alphabet used. Type "getsize" to see options. */ #define DEFINE_GLOBALS #include "read_sequence.h" #include "general.h" #include "hash_table.h" #include "hash_alph.h" #define DATA_HASH_SIZE 100000 #define LDNAB 30 /**********************************************************************/ /* main */ /**********************************************************************/ extern int main( int argc, char *argv[] ) { long i, j, k; char *datafile = NULL; FILE *data_file; int n_samples; long max_slength, min_slength, length; char *sample_name, *id, *seq; int bfile = 0; /* do not print bfile format */ BOOLEAN l_only = FALSE; /* do not print lengths only */ BOOLEAN print_duplicates = TRUE; /* print names of duplicates */ BOOLEAN print_frequencies = FALSE; /* do not print letter frequencies */ BOOLEAN print_table = FALSE; /* do not print frequencies table */ BOOLEAN xlate_dna = FALSE; /* do not translate DNA */ BOOLEAN print_codons = FALSE; /* do not print codon usage */ HASH_TABLE ht_seq_names; /* hash of dataset seq names */ double codons[LDNAB][LDNAB][LDNAB]; /* counts of used codons */ double alphabet[6][MAXASCII]; /* counts of used letters (per frame) */ double total_res[6]; /* total letters per frame */ char *letters = NULL; /* string of used letters */ /* set name of command */ argv[0] = "getsize"; /* get the command line arguments */ i = 1; DO_STANDARD_COMMAND_LINE(1, USAGE( [options]); NON_SWITCH(1, \n\t\t\tfile containing sequences in FASTA format\n, switch (i++) { case 1: datafile = _OPTION_; break; default: COMMAND_LINE_ERROR; }); FLAG_OPTN(1, l, \t\t\tjust print the length of each sequence, l_only = TRUE); FLAG_OPTN(1, nd, \t\t\tdo not print warnings about duplicate sequences, print_duplicates = FALSE); FLAG_OPTN(1, dna, \t\t\tprint dna frequencies in bfile format, bfile = 1); FLAG_OPTN(1, prot, \t\t\tprint protein frequencies in bfile format, bfile = 2); FLAG_OPTN(1, f, \t\t\tprint letter frequencies, print_frequencies = TRUE); FLAG_OPTN(1, ft, \t\t\tprint letter frequencies as latex table, print_table = TRUE); FLAG_OPTN(1, x, \t\t\ttranslate DNA in six frames, xlate_dna = TRUE); FLAG_OPTN(1, codons, \t\tprint frame0 codon usage (implies -f xlate_dna), print_codons = xlate_dna = print_frequencies = TRUE); USAGE(\n\tMeasure statistics of a FASTA file.); ) /* Setup hashing function for encoding strings as integers. If translating from DNA to protein, input alphabet must be DNAB; otherwise it may be all 26 letters plus asterisk. */ if (xlate_dna) { setup_hash_alph(DNAB); /* DNAB to position hashing */ setup_hash_alph(PROTEINB); /* PROTEINB to position hash */ setup_hash_dnab2protb(); /* DNAB to PROTEINB hashing */ } else if (bfile == 1) { /* get DNA frequencies */ setup_hash_alph(DNAB); /* DNAB to position hashing */ } else if (bfile == 2) { /* get PROT frequencies */ setup_hash_alph(PROTEINB); /* PROTEINB to position hash */ } else { setup_hash_alph("ABCDEFGHIJKLMNOPQRSTUVWXYZ*-"); /* full alphabet */ } Resize(letters, MAXASCII, char); /* create a hash table of sequence names */ ht_seq_names = hash_create(DATA_HASH_SIZE); /* open data file */ if (datafile == NULL) { fprintf(stderr, "You must specify a data file or 'stdin'\n"); exit(1); } else if ((strcmp(datafile, "stdin") == 0) || (strcmp(datafile, "-") == 0)) { data_file = stdin; } else { data_file = fopen(datafile, "r"); if (data_file == NULL) { fprintf(stderr, "Cannot open file '%s'\n", datafile); exit(1); } } /* initialize counts of letters and codons used */ for (i=0; i<6; i++) for (j=0; j