PYTHON   77

dataset converter py

Guest on 25th June 2022 08:09:57 AM

  1. import numpy as np
  2. import math
  3.  
  4. #Global Variables
  5. vocab = ['S', 'T', 'Y']
  6. amino_acid = ['G', 'A', 'V', 'L', 'I', 'M', 'F', 'W', 'P', 'S', 'T', 'Y', 'C', 'N', 'Q', 'D', 'E', 'K', 'R', 'H']
  7. feature_size, context_window = 69, 10
  8. X = list()
  9.  
  10. #Converts the given sequence to a dataset using the features provided
  11. def converter(seq, feats):
  12.     amino_acid_count = dict()
  13.     for i in amino_acid:
  14.         amino_acid_count[i] = seq.count(i)+1
  15.     feats = [int(i) for i in feats.strip().split()]
  16.  
  17.     for i in range(len(seq)):
  18.         if seq[i] in vocab:
  19.             start, end = (i-context_window)*feature_size, (i+1+context_window)*feature_size
  20.             amino_acid_window = seq[0:int((end+(0-start))/feature_size)]
  21.             score = [amino_acid_window.count(k)/amino_acid_count[k] for k in amino_acid]
  22.             if start < 0:
  23.                 #Dynamic Window
  24.                 X.append(feats[0:end+(0-start)]+score+[len(seq)-i]+[i])
  25.             elif end > len(feats):
  26.                 X.append(feats[start-(end-len(feats)):len(feats)]+score+[len(seq)-i]+[i])
  27.             else:
  28.                 X.append(feats[start:end]+score+[len(seq)-i]+[i])
  29.    
  30.     return X

Raw Paste


Login or Register to edit or fork this paste. It's free.