C 10
Xblast.c Guest on 3rd October 2020 03:03:23 PM
  1.  
  2. #include <stdio.h>
  3. #include <string.h>
  4. /****************************************************************************/
  5. /*  xblast.c   2.0     Jean-Michel Claverie            Dec 92               */
  6. /*  module to read a BLAST output and to generate a filtered query          */
  7. /*  This version includes the -rd option                                    */
  8. /*  Default option:  " X " the matching part                                */
  9. /*           -r:  keep the matching part, "X" the rest                      */
  10. /*           -d:  build a db of each individual contiguous matching segment */
  11. /*    each defline is then annotated with the [positions]                   */
  12. /****************************************************************************/
  13.  
  14. #define MAXLINE 200
  15. #define DEFLIMIT 500
  16. #define SEQLIMIT 500000
  17. #define MASK 'x'
  18.  
  19. main (argc, argv)
  20. int argc;
  21. char *argv[];
  22. {
  23. int fgt_line(), gt_1stword(), gt_fasta();
  24. FILE *FIN, *QRY;
  25. char line[MAXLINE], *pline, word[MAXLINE], X= MASK;
  26. char defline[DEFLIMIT],defline2[DEFLIMIT],seq[SEQLIMIT], newseq[SEQLIMIT], *s, *ns;
  27. int c, i, pos1, pos2, size, OptR=0 , OptD=0, Opt=0, match;
  28.  
  29. /***********************  test arguments and usages *************************/
  30. if(argc<3 || argc >5)
  31.   {fprintf(stderr,"\n USAGE:   %s  [-r]  blast.output query    <X_char>\n",argv[0]);
  32.    fprintf(stderr,"\n   pipe | %s  [-r]       +       query    <X_char>\n",argv[0]);
  33.    fprintf(stderr,"          %s  [-d]  blast.output query    <X_char>\n",argv[0]);
  34.    fprintf(stderr,"   pipe | %s  [-d]       +       query    <X_char>\n",argv[0]);
  35.    exit(0);}
  36. /************************   fopen and tests  ********************************/
  37. if (!strcmp(argv[1],"-r"))   OptR=1;             /* reverse option set */
  38. if (!strcmp(argv[1],"-d"))   OptD=1;   /* reverse and divide options set */
  39.  
  40. Opt = OptR + OptD;                     /* Option offset */
  41. if (*argv[1+Opt]=='+') FIN=stdin;
  42. else if (NULL==(FIN=fopen(argv[1+Opt],"r")))
  43.          { fprintf(stderr," ERROR: opening  %s  ?\n",argv[1+OptR]); exit(0);}
  44.  
  45. if (NULL==(QRY=fopen(argv[2+Opt],"r")))
  46.          { fprintf(stderr," ERROR: opening  %s  ?\n",argv[2+Opt]); exit(0);}
  47. if (argc==4+Opt) X= (*argv[3+Opt]);
  48. /**************************** get fasta sequence in *************************/
  49. if (2>=(size=gt_1stfasta(QRY, defline, seq)))
  50.    { fprintf(stderr," ERROR: query  %s  ?\n",argv[2+Opt]); exit(0);}
  51.  
  52. /****************************************************************************/
  53.  
  54. while (1)               /*  skipping until Sequences */
  55.         {
  56.         c=fgt_line(FIN,MAXLINE,line);
  57.         if (c==EOF)
  58.            { fprintf(stderr," ERROR: bad BLAST output %s ?\n",argv[1]);
  59.              exit(0);
  60.            }
  61.         gt_1stword (line,word);
  62.         if(!strcmp("Sequences",word)) break; /* found Sequences */
  63.         }
  64.        
  65. if (!OptR && !OptD)     /*   default: X matching segments  */
  66.    {  
  67.    while (1)            /* Looking for Query line */
  68.         {
  69.         c=fgt_line(FIN,MAXLINE,line);
  70.         if (c==EOF)  break;
  71.         c=gt_1stword (line,word);
  72.         if(!strcmp("Query:",word))   /* found Query line */
  73.           {
  74.           pline=line+c;
  75.           c=gt_1stword (pline,word); sscanf(word,"%d",&pos1);
  76.           pline += c;
  77.           c=gt_1stword (pline,word); /* skip sequence */
  78.           pline += c;
  79.           c=gt_1stword (pline,word); sscanf(word,"%d",&pos2);
  80.  
  81.            if (pos1> pos2)
  82.                 { c= pos1; pos1= pos2; pos2= c;}
  83.  
  84.            if (pos1 < 1 || pos2 > size)
  85.           {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);}
  86.  
  87.           for (c=pos1-1; c<pos2; c++) seq[c]= X;
  88.  
  89.           }
  90.         }
  91.         printf("%s \n", defline);
  92.  
  93.         for (c=0, i=59; c<size ;  c++, i--)
  94.             { putchar(seq[c]);
  95.               if (!i) { putchar('\n'); i=60; }
  96.             }
  97.         putchar('\n');
  98.    }
  99.  
  100. if (OptR)        /*   keep matching segments, do not write empty seq   */
  101.    {
  102.  
  103.    for (s=seq, ns=newseq; *s != '\0'; s++, ns++) { *ns=*s, *s=X; }
  104.    *ns = '\0';
  105.    match=0;                     /*  No match yet  */
  106.  
  107.    while (1)            /* Looking for Query line */
  108.         {
  109.         c=fgt_line(FIN,MAXLINE,line);
  110.         if (c==EOF)  break;
  111.         c=gt_1stword (line,word);
  112.         if(!strcmp("Query:",word))   /* found Query line */
  113.           {
  114.           pline=line+c;
  115.           c=gt_1stword (pline,word); sscanf(word,"%d",&pos1);
  116.           pline += c;
  117.           c=gt_1stword (pline,word); /* skip sequence */
  118.           pline += c;
  119.           c=gt_1stword (pline,word); sscanf(word,"%d",&pos2);
  120.  
  121.            if (pos1> pos2)
  122.                 { c= pos1; pos1= pos2; pos2= c;}
  123.  
  124.            if (pos1 < 1 || pos2 > size)
  125.           {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);}
  126.  
  127.         for (c=pos1-1; c<pos2; c++) seq[c]= newseq[c];
  128.         match=1;     /* we now had one match, seq will be written out */
  129.           }
  130.         }
  131.        
  132.         if (match)
  133.            {
  134.            printf("%s \n", defline);
  135.            for (c=0, i=59; c<size ;  c++, i--)
  136.                { putchar(seq[c]);
  137.                  if (!i) { putchar('\n'); i=60; }
  138.                }
  139.                putchar('\n');
  140.            }
  141.    }
  142.  
  143. if (OptD)
  144.    {
  145.    /*  prepare the defline for this option */
  146.    c= gt_1stword (defline+1,word);
  147.    *defline = '>';
  148.    strcpy(defline2, defline+c+1);  /* keep the rest of the line */
  149.    strcpy(defline+1,word);
  150.  
  151.    for (s=seq, ns=newseq; *s != '\0'; s++, ns++) { *ns=*s, *s=X; }
  152.    *ns = '\0';
  153.    match=0;                     /*  No match yet  */
  154.  
  155.    while (1)            /* Looking for Query line */
  156.         {
  157.         c=fgt_line(FIN,MAXLINE,line);
  158.         if (c==EOF)  break;
  159.         c=gt_1stword (line,word);
  160.         if(!strcmp("Query:",word))   /* found Query line */
  161.           {
  162.           pline=line+c;
  163.           c=gt_1stword (pline,word); sscanf(word,"%d",&pos1);
  164.           pline += c;
  165.           c=gt_1stword (pline,word); /* skip sequence */
  166.           pline += c;
  167.           c=gt_1stword (pline,word); sscanf(word,"%d",&pos2);
  168.  
  169.            if (pos1> pos2)
  170.                 { c= pos1; pos1= pos2; pos2= c;}
  171.  
  172.            if (pos1 < 1 || pos2 > size)
  173.           {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);}
  174.  
  175.         for (c=pos1-1; c<pos2; c++) seq[c]= newseq[c];
  176.         match=1;     /* we now had one match, seq will be written out */
  177.           }
  178.         }
  179.        
  180.     if (match)
  181.        {
  182.        pos2=1; s=seq;
  183.        while (pos2 <= size)
  184.            {
  185.            for (pos1=pos2; pos1<=size; pos1++,s++) if (*s!= X)  break;
  186.            pos2 = pos1;
  187.            if (pos2 <= size)    /* not the end of current seq */
  188.               {
  189.               for (   ; pos2 <=size; pos2++, s++) if ( *s == X ) break;
  190.          
  191.               printf("%s_%d_%d %s\n",defline,pos1,pos2-1,defline2);
  192.  
  193.               for (c=pos1-1, i=59; c<pos2-1 ;  c++, i--)
  194.                   { putchar(seq[c]);
  195.                     if (!i) { putchar('\n'); i=60; }
  196.                   }
  197.               putchar('\n');
  198.               }
  199.            }
  200.        }
  201.    }        /*  end option -d   */
  202.  
  203. exit(0);
  204. }
  205.  
  206. /* ---------------------------------------------------------------------*/
  207. /* fgt_line(Fin, max,line)              JMC jan 1991       SGI          */
  208. /* #include <stdio.h>                                                   */
  209. /* function to input a line from an opened steam file into a char array,*/
  210. /* with checking on the max length < max                                */
  211. /* file "records" must be delimited by '\n'                             */
  212. /* string is filled out until \n is encountered, or max-1 chars read    */
  213. /* or EOF occurs                                                        */
  214. /* arguments:                                                           */
  215. /*      Fin: FILE pointer to input stream                               */
  216. /*      max: array dimension [], max length including the final \0      */
  217. /*     line: string or char *                                           */
  218. /* return: the number of usable chars in the string (\0 not included)   */
  219. /*       : or max if capacity was exceeded, line contains               */
  220. /*                max-1 usable characters.                              */
  221. /*       : 0 if NEW LINE was encountered first, line starts with '\0'   */
  222. /*       :-1 if EOF was encountered, line starts with '\0'              */
  223. /* -------------------------------------------------------------------- */
  224.  
  225. fgt_line (Fin,max,line)
  226. int max;
  227. char line[];
  228. FILE *Fin;
  229.  
  230. {
  231. int c, i;
  232.  
  233. for (i=0; (c=fgetc(Fin))!= EOF && c!='\n' && i<(max-1);i++) line[i]=c;
  234.  
  235. line[i]='\0';           /* replace NEW LINE or last by '\0', don't count */
  236.  
  237. if (c==EOF) return(-1);                         /* EOF encountered */
  238.        
  239. if (i==(max-1) && c!='\n') { while (fgetc(Fin)!='\n');     /* clean buffer */
  240.                            return(max); }               /* flag overflow   */
  241.         return (i);
  242. }  
  243.  
  244. /* sgt_line do the same things from a string */
  245.  
  246.  
  247. /* -------------------------------------------------------------------  */
  248. /* int gt_1stword()         Jean-Michel Claverie      feb 1991    SGI   */
  249. /* module to read the first word of a closed string                     */
  250. /* ARGUMENTS                                                            */
  251. /* IN:   line, a closed string                                          */
  252. /* OUT:  word, a closed string,  WITH sufficient space allocation       */
  253. /* RETURN: next: the relative position of the first characters NOT      */
  254. /*         belonging to the first word in the string                    */
  255. /*         thus, no more word in the string returns   0 (false)         */
  256. /* Note : the next call can then use : line + next, as a pointer to     */
  257. /*        initiate the search for the next 1st word in the string       */
  258. /* -------------------------------------------------------------------- */
  259.                
  260. int gt_1stword (line,word)
  261. char *line, *word;
  262. {
  263. char *p;
  264. for ( p=line; isspace(*p) ; p++);  /* skeep leading blanks */
  265. if (*p == '\0') return (0) ;            /* no word in line */
  266.  
  267. while (!isspace(*p) && *p!='\0'){ *word =(*p) ; word++; p++;}
  268. *word='\0';
  269. return (p-line);
  270. }
  271.  
  272. /********************************************************************** */
  273. /* int gt_1stfasta ()    jean-Michel Claverie       feb 92  SGI         */
  274. /* module to load the 1st defline and sequence from a fasta format file */
  275. /* (DEFLIMIT-1)  chars of the first line are stored                     */
  276. /* Definition line  must start with a " > "                             */
  277. /* Sequence size input limited at SEQLIMIT-1                            */
  278. /* in:                                                                  */
  279. /* *Fin : "r" opened file (fasta db)                                    */
  280. /* char *defline, char *seq : char arrays for defline, sequence         */
  281. /* out:                                                                 */  
  282. /* defline as a closed string,                                          */
  283. /* seq as a closed string, no blank, lower or upper                     */
  284. /* return: sequence size or EOF (-1) if no more entry                   */
  285. /********************************************************************** */
  286.  
  287. int gt_1stfasta(Fin, defline, seq)
  288.  
  289. FILE *Fin;
  290. char *defline, *seq;
  291. {
  292. int i, c, size;
  293. char *in;    
  294. /* loading all definition line up to deflimit */
  295.  
  296.  if (EOF==(c=fgetc(Fin))) return c;              /* end of fasta file */
  297.  if (c!='>')
  298.     { fprintf(stderr,"\n ERROR: missing '>' in def line\n");
  299.       exit(0);
  300.     }
  301.  
  302.        *defline='>';
  303.        for (i=DEFLIMIT-2, in=defline+1 ; i; i--, in++)
  304.     { c=fgetc(Fin);
  305.       if (c==EOF)
  306.          {fprintf(stderr,"\n ERROR: unexpected EOF in def line\n");
  307.            exit(0);
  308.          }
  309.       *in = c;
  310.       if (*in=='\n') break;   /* end of def line reached  */
  311.     }
  312.  
  313. if (!i)         /*  DEFLIMIT was reached, go find end of line  */
  314.    {
  315.    while (EOF !=(c=fgetc(Fin)))  if (c=='\n') break;
  316.       if (c==EOF)
  317.       {fprintf(stderr,"\n ERROR: unexpected EOF in def line %s\n", defline);
  318.        exit(0);
  319.       }
  320.    }
  321.  
  322.  *in= '\0';      /* defline closure  */
  323.  
  324. /* loading sequence up to EOF, next '>' or SEQLIMIT     */
  325.  
  326. i=SEQLIMIT-1;
  327. size=0;
  328. while (EOF != (c=fgetc(Fin)) && c !='>')
  329.       {
  330.       if (!isspace (c)) { *seq =c; seq++; i--; size++; }
  331.       if (!i)
  332.          { fprintf (stderr,"\n ERROR: seq overflow %s %d \n", defline, size);
  333.            exit(0);
  334.          }
  335.       }
  336.  
  337. *seq='\0'; ungetc(c,Fin);
  338. return size;
  339.  
  340. }

Paste is for source code and general debugging text.

Login or Register to edit, delete and keep track of your pastes and more.

Raw Paste

Login or Register to edit or fork this paste. It's free.