#!/usr/bin/awk -f
#
# bcgscSageToBed -prefix=prefix mapFile ... >bed8
#
# Convert long-SAGE sequences in files from BCGSC to BED8
#
#   1            2          3       4            5                              6
#   TagSequence  Tag count  Contig  NlaIII site  Position of mapping on Contig  strand
#
# - Positions in the file are 1-based and indicate the location of the start
#   of CATG; the tag sequence is down-stream of this (in the direction of
#   transcription) and does not contain the CATG.
#   - for + strand, the tag sequence starts 4 bases after the position
#   - for - strand, the tag sequence ends 4 bases before the position
#
# Output columns: contig, start, end, name, score, strand, thickStart, thickEnd.
# The thick region covers the 4-base CATG site; start..end covers the CATG
# plus the tag.  The name is the prefix followed by the input record number,
# and the score is the raw tag count.

BEGIN {
    OFS = "\t";
    prefix = "";

    # Consume leading options.  ARGV is indexed 0..ARGC-1, so the loop must
    # stop before ARGC rather than probe a nonexistent element.
    for (i = 1; i < ARGC; i++) {
        if (ARGV[i] !~ "^-") {
            # first non-option argument: the rest are input files
            break;
        } else if (ARGV[i] ~ "^-prefix=.*") {
            # the value starts right after the 8 characters of "-prefix="
            prefix = substr(ARGV[i], 9);
            # blank the slot so awk does not treat the option as an input file
            ARGV[i] = "";
        } else {
            print "Error: invalid option:", ARGV[i] > "/dev/stderr";
            exit(1);
        }
    }
}

# Pull the fields shared by both strands out of every record.
{
    contig = toupper($3);
    tagSeq = $1;
    tagCnt = $2;
    strand = $6;
    name = prefix NR;

    # score assumes log(tagCnt) <= 10, so score <= 1000
    ## score = int(log(tagCnt)*100);

    # not really a browser score
    score = tagCnt;
}

# A record with any other strand matches neither coordinate rule below; fail
# here instead of printing coordinates left over from the previous record.
strand != "+" && strand != "-" {
    print "Error: invalid strand \"" strand "\" at " FILENAME ":" FNR > "/dev/stderr";
    exit(1);
}

# + strand: $5 is the 1-based first base of CATG, so the 0-based start is $5-1
# and the feature runs through the CATG (4 bases) plus the tag.
strand == "+" {
    start = ($5 - 1);
    end = start + length(tagSeq) + 4;
    thickStart = start;
    thickEnd = $5 + 3;
}

# - strand: $5 is used as the 1-based right-most base of CATG in contig
# coordinates, so it is also the exclusive 0-based end; the tag lies to the
# left of the CATG.
strand == "-" {
    start = ($5 - 1) - length(tagSeq) - 3;
    end = $5;
    thickStart = end - 4;
    thickEnd = end;
}

{
    print contig, start, end, name, score, strand, thickStart, thickEnd;
}