/* output a subset of lines of whose selected match . Assume source.txt and select.txt are sorted according to these fields. */ #include #include #include #include #include "string_tools.h" //parse and load fields from an input line according to a format string void parse_fields(char const* format_string, char const* line, void ** store, size_t nfields) { switch (nfields) { case 1: sscanf(line, format_string, store[0]); break; case 2: sscanf(line, format_string, store[0], store[1]); break; case 3: sscanf(line, format_string, store[0], store[1], store[2]); break; case 4: sscanf(line, format_string, store[0], store[1], store[2], store[3]); break; default: fprintf(stderr, "Error: parse_fields only handles up to 4 fields\n"); break; } } int compare(void const* a, void const* b, char typecode) { switch (typecode) { case 'i': { int anum = *static_cast(a); int bnum = *static_cast(b); return anum < bnum ? -1 : (anum == bnum ? 0 : 1); break; } case 'f': { float anum = *static_cast(a); float bnum = *static_cast(b); return anum < bnum ? -1 : (anum == bnum ? 0 : 1); break; } case 's': { char const* astr = static_cast(a); char const* bstr = static_cast(b); return strcmp(astr, bstr); break; } default: fprintf(stderr, "Error: unrecognized typecode '%c'\n", typecode); exit(1); break; } } int compare_all(void ** af, void ** bf, char const* typecode, size_t num_fields) { int cmp; for (size_t f = 0; f != num_fields; ++f) { cmp = compare(af[f], bf[f], typecode[f]); if (cmp != 0) { break; } } return cmp; } const size_t SOURCE_MAX_LINE = 1000000; const size_t SELECT_MAX_LINE = 1024; int main(int argc, char **argv) { if (argc == 1) { fprintf(stderr, "Usage: filter_matching_lines " " \n\n" " and are both printf-stype strings\n" "defining what fields to select or match from tabular text input files\n" " and \n" " is a string [ifs]+ (integer, float, or string)\n" ); return 0; } char * source_fname = argv[1]; char * source_fmt_raw = argv[2]; char * select_fname = argv[3]; char * select_fmt_raw = argv[4]; char * field_types = argv[5]; FILE * source_fh = fopen(source_fname, "r"); FILE * select_fh = fopen(select_fname, "r"); char source_fmt[100]; char select_fmt[100]; convert_escapes(source_fmt, source_fmt_raw); convert_escapes(select_fmt, select_fmt_raw); size_t num_fields = strlen(field_types); char * buffer = new char[num_fields * 200]; void ** source_fields = new void *[num_fields]; void ** select_fields = new void *[num_fields]; for (size_t f = 0; f != num_fields; ++f) { source_fields[f] = buffer + (100 * f); select_fields[f] = buffer + (100 * (f + num_fields)); } char source_line[SOURCE_MAX_LINE]; char select_line[SELECT_MAX_LINE]; //parse the fields of the source line while (fgets(select_line, SELECT_MAX_LINE, select_fh)) { parse_fields(select_fmt, select_line, select_fields, num_fields); while (1) { if (feof(source_fh)) { //ran out of source, didn't find matching line fprintf(stderr, "Erorr: source ran out of lines for selection line\n%s\n", select_line); exit(1); } fgets(source_line, SOURCE_MAX_LINE, source_fh); parse_fields(source_fmt, source_line, source_fields, num_fields); //do a short-circuiting 3-way comparison int cmp = compare_all(select_fields, source_fields, field_types, num_fields); if (cmp < 0) { //select < source. error: select should always be >= source. fprintf(stderr, "Error: source is missing matching line for selection line\n%s\n", select_line); exit(1); } else if (cmp == 0) { //select == source. line matches. print it. printf("%s", source_line); break; } else { //select > source. need to catch up by parsing more source lines. } } } delete buffer; delete source_fields; delete select_fields; }