查看原文
其他

s06 - Consensus and Profile

Y叔 biobabble 2020-02-05

Problem

A matrix is a rectangular table of values divided into rows and columns. An m×n matrix has m rows and n columns. Given a matrix A, we write Ai,j to indicate the value found at the intersection of row i and column j.

Say that we have a collection of DNA strings, all having the same length n. Their profile matrix is a 4×n matrix P in which P1,j represents the number of times that ‘A’ occurs in the jth position of one of the strings, P2,j represents the number of times that C occurs in the jth position, and so on (see below).

A consensus string c is a string of length n formed from our collection by taking the most common symbol at each position; the jth symbol of c therefore corresponds to the symbol having the maximum value in the j-th column of the profile matrix. Of course, there may be more than one most common symbol, leading to multiple possible consensus strings.

           A T C C A G C T            G G G C A A C T            A T G G A T C T DNA Strings A A G C A A C C            T T G G A A C T            A T G C C A T T            A T G G C A C T            A   5 1 0 0 5 5 0 0 Profile     C   0 0 1 4 2 0 6 1            G   1 1 6 3 0 1 0 0            T   1 5 0 0 0 1 1 6 Consensus    A T G C A A C T

Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)

Sample Dataset

>Rosalind_1 ATCCAGCT >Rosalind_2 GGGCAACT >Rosalind_3 ATGGATCT >Rosalind_4 AAGCAACC >Rosalind_5 TTGGAACT >Rosalind_6 ATGCCATT >Rosalind_7 ATGGCACT

Sample Output

ATGCAACT A: 5 1 0 0 5 5 0 0 C: 0 0 1 4 2 0 6 1 G: 1 1 6 3 0 1 0 0 T: 1 5 0 0 0 1 1 6

Solution

这道题给出alignment,要求生成一个碱基的频数表,并给出consensus序列。

C version

在Computing GC content这一道题中就引入了FASTA格式,但那道题太简单,根本不需要存储序列,而这道题不同,我们需要考虑到序列的解析存储问题。这道题虽然说FASTA的描述行和序列行在不同序列中都是等长的,但现实中的FASTA文件并非如此,我们需要考虑到这一点,并实现一个通用的解析函数。这里再次使用了链表,最终读出来的序列,也是以链表形式返回,当然我们很容易写个wrapper function,返回SEQ结构体的指针构成的向量。

首先定义了readFASTA.h

typedef struct SEQ {
 char * desc;  
 char * seq;  
 int width;  
 struct SEQ * next; } SEQ;

typedef
struct CHLL {  // CHaracter Link-List  char ch;
 struct CHLL * next; } CHLL; SEQ * readFASTA(char *filename); SEQ * readFASTA(char *filename) {  FILE *INFILE;  INFILE = fopen(filename, "r");  SEQ *head = malloc(sizeof(SEQ));  SEQ *curr;  curr = head;
 char ch;  
 int desc_line = 0, i;  
 while( (ch = fgetc(INFILE)) != EOF) {    SEQ *SEQ_NODE = malloc(sizeof(SEQ));    CHLL *CHLL_head = malloc(sizeof(CHLL));    CHLL *CHLL_curr;    CHLL_curr = CHLL_head;    // description line    if ( ch == '>') {      desc_line = 1;    }
   if (desc_line == 1) {      i = 0;
     while( (ch = fgetc(INFILE) ) != '\n') {          CHLL *CHLL_NODE = malloc(sizeof(CHLL));          i++;          CHLL_NODE->ch = ch;          CHLL_curr->next = CHLL_NODE;          CHLL_curr = CHLL_NODE;      }    }    
   int desc_width = i;    CHLL_curr = CHLL_head->next;
   char* desc = malloc(sizeof(char) * desc_width);    
   for (i=0; i < desc_width; i++) {      desc[i] = CHLL_curr->ch;      CHLL_curr = CHLL_curr->next;    }    SEQ_NODE->desc = desc;    // sequence lines    // re-initial the CHLL link list to store sequence characters    CHLL_curr = CHLL_head;    desc_line = 0;    i = 0;    
   while( (ch=fgetc(INFILE)) != '>' && ch != EOF) {
     if (ch == '\n') {
         continue;      }      CHLL *CHLL_NODE = malloc(sizeof(CHLL));      CHLL_NODE->ch = ch;      CHLL_curr->next = CHLL_NODE;      CHLL_curr = CHLL_NODE;      i++;    }
   int seq_width = i;    
   char * seq = malloc(sizeof(char)*seq_width);    CHLL_curr = CHLL_head->next;    
   for (i=0; i < seq_width; i++) {      seq[i] = CHLL_curr->ch;      CHLL_curr = CHLL_curr->next;    }    SEQ_NODE->seq = seq;    SEQ_NODE->width = seq_width;    curr->next = SEQ_NODE;    curr = SEQ_NODE;    desc_line = 1;  }  
 return head; }

有了FASTA的解析函数之后,问题就容易了,无非是读文件,核苷酸计数,最近打印结果。

#include<stdio.h> #include<stdlib.h> #include "include/readFASTA.h" int main() {  SEQ * head, *curr;  head = readFASTA("DATA/rosalind_cons.txt");  curr = head;  int width=0;  while(curr = curr->next) {    if (width < curr->width) {      width = curr->width;    }  }  int NT_type = 4; // ACGT  int cons[NT_type][width];  int r, c;  for (r=0; r < NT_type; r++) {    for (c=0; c < width; c++) {      cons[r][c] = 0;    }  }  curr=head;  int i;  while(curr = curr->next) {    for (i=0; i < curr->width; i++) {      switch((curr->seq)[i]) {      case 'A':          cons[0][i]++;          break;      case 'C':          cons[1][i]++;          break;      case 'G':          cons[2][i]++;          break;      case 'T':          cons[3][i]++;          break;      }    }  }  char NT[4] = "ACGT";  int max_r_idx, max_r=0;  for (c=0; c < width; c++) {    for (r=0; r < NT_type; r++) {      if (max_r < cons[r][c]) {          max_r = cons[r][c];          max_r_idx = r;      }    }    printf("%c", NT[max_r_idx]);    max_r = 0;  }  printf("\n");  for (r=0; r < NT_type; r++) {    printf("%c: ", NT[r]);    for (c=0; c < width; c++) {      printf("%d ", cons[r][c]);    }    printf("\n");  } }

Python version

这里我读序列就是看到>就证明上一条序列读完了,可以更新计数了。
而这个更新,主要是zip函数稍微难懂一点,str是序列,而list是列表,str如果是char一样就是1不然是0,所以 [1 if x == char else 0 for x in str] 这一句根据str生成了一个0,1的列表,这个列表当然也和list等长,用zip对两个列表同时进行迭代,并加和,这样就更新了计数。

#!/usr/bin/env python3

def update_cnt(list, char, str):    res = [a + b for a,b in zip(list, [1 if x == char else 0 for x in str])]    return(res)def print_cnt(list, label):    print(label, end = ": ")
   for x in list:        print(x, end=" ")    print() f = open("DATA/rosalind_cons.txt", 'r') fas = f.readlines() seq = ''
j = 0
for i in range(len(fas)):
   if fas[i][0] == '>' and j == 0:        j += 1        next    
   elif fas[i][0] == '>':
       if j == 1:            A = [0 for x in seq]            C = A            G = A            T = A        A = update_cnt(A, 'A', seq)        C = update_cnt(C, 'C', seq)        G = update_cnt(G, 'G', seq)        T = update_cnt(T, 'T', seq)        j += 1        seq = ''    else:        seq += fas[i].strip()

## last seq
A = update_cnt(A, 'A', seq) C = update_cnt(C, 'C', seq) G = update_cnt(G, 'G', seq) T = update_cnt(T, 'T', seq) consensus = ''
for i in range(len(A)):    m = max(A[i], C[i], G[i], T[i])
   if A[i] == m:        consensus += 'A'    elif C[i] == m:        consensus += 'C'    elif G[i] == m:        consensus += 'G'    elif T[i] == m:        consensus += 'T'

print(consensus) print_cnt(A, 'A') print_cnt(C, 'C') print_cnt(G, 'G') print_cnt(T, 'T')

    您可能也对以下帖子感兴趣

    文章有问题?点此查看未经处理的缓存