PYTHON 7
FinalProject.py Guest on 29th July 2020 11:48:36 AM
  1. import sys
  2. import os
  3. import numpy as np
  4. import pandas as pd
  5. import seaborn as sns
  6. import matplotlib as mpl
  7. from pandas import DataFrame as df
  8.  
  9. import scipy
  10. from scipy.stats import zscore
  11. from sklearn import linear_model
  12. from matplotlib import pyplot as plt
  13.  
  14. reg = linear_model.LinearRegression()
  15.  
  16.  
  17. mpl.rcParams['pdf.fonttype'] = 42
  18. mpl.rcParams['ps.fonttype'] = 42
  19.  
  20.  
  21. __Author__ = 'Peihao Fan'
  22. __Date__ = 'Dec 1, 2019'
  23.  
  24.  
  25.  
  26. """
  27. Do the multivariable linear regression to know the coefficient of expressions of different genes representing different disease and pathways.
  28.  
  29. 1. prepare the data;
  30. 2. z score
  31.  
  32. """
  33.  
  34.  
  35. def bootsrapResample(x, y, n):
  36.  
  37.     '''
  38.    To get the confidence interval of coefficient, we need to use bootsrapping to get a series of coefficients.
  39.    Input:
  40.        x: list of gene expression data of coefficient genes.
  41.        y: list of gene expression data of target gene.
  42.        n: number of samples
  43.    Output:
  44.        resampled_x:
  45.        resampled_y:
  46.    '''
  47.  
  48.     resample_i = np.floor(np.random.rand(n) * n).astype(int)
  49.  
  50.     resampled_x = [x[i] for i in resample_i]
  51.     resampled_y = [y[i] for i in resample_i]
  52.  
  53.     return resampled_x, resampled_y
  54.  
  55.  
  56.  
  57.  
  58. def runReg(x, y, n):
  59.  
  60.     '''
  61.    Run the multivariable linear regression, using bootstrapping resampled samples.
  62.    Input:
  63.        x: list of gene expression data of coefficient genes.
  64.        y: list of gene expression data of target gene.
  65.        n: number of samples
  66.    Output:
  67.        beta_df: coefficients of gene expressions, for 1000 times.
  68.    '''
  69.  
  70.     beta_matrix = []
  71.  
  72.     for i in range(1000):
  73.  
  74.         # resampling
  75.         resampled_x, resampled_y = bootsrapResample(x=x, y=y, n=n)
  76.  
  77.         reg.fit(resampled_x, resampled_y)
  78.  
  79.         beta_matrix.append(reg.coef_)
  80.  
  81.     beta_df = df(beta_matrix, columns=sys.argv[2:])
  82.  
  83.  
  84.     return beta_df
  85.  
  86.  
  87.  
  88.  
  89. gene_list = sys.argv[1:]
  90. exp_df = df()
  91. for gene in gene_list:
  92.     exp_gene = pd.read_csv('mRNA expression (RNAseq)_ '+ gene +'.txt', index_col=0, header=0, sep='\t').T
  93.  
  94.     exp_df = pd.concat([exp_df, exp_gene], axis=1)
  95.  
  96. exp_df = exp_df.dropna()
  97.  
  98. exp_df = exp_df.apply(zscore)
  99.  
  100.  
  101.  
  102. y = list(exp_df[sys.argv[1]])
  103. x_gene_list = sys.argv[2:]
  104.  
  105. x = [[exp_df.loc[i, gene] for gene in x_gene_list] for i in exp_df.index]
  106.  
  107. n = exp_df.shape[0]
  108.  
  109. beta_df = runReg(x=x, y=y, n=n)
  110.  
  111. beta_df.to_csv('coefficients %s.csv' % str(sys.argv[1:]))
  112.  
  113.  
  114. beta_df = pd.read_csv(('coefficients %s.csv' % str(sys.argv[1:])),index_col=0, header=0, sep=',')
  115.  
  116. x_gene_list = sys.argv[2:]
  117.  
  118. plot_df = df()
  119. for gene in x_gene_list:
  120.     beta_gene = df(beta_df[gene])
  121.     beta_gene = beta_gene.rename(columns={gene: 'beta'})
  122.     beta_gene.loc[:, 'gene_name'] = gene
  123.     plot_df = pd.concat([plot_df, beta_gene], axis=0)
  124.  
  125. sns.stripplot(x="gene_name", y="beta",  data=plot_df, dodge=True,
  126.     jitter=True, palette=['skyblue','lightcoral'], alpha=1, size=5, zorder=0)
  127. sns.boxplot(x="gene_name", y="beta",  data=plot_df, dodge=True,
  128.     palette=['skyblue','lightcoral'], boxprops={'facecolor':'None'}, fliersize=2, zorder=2)
  129.  
  130. plt.show()

Paste is for source code and general debugging text.

Login or Register to edit, delete and keep track of your pastes and more.

Raw Paste

Login or Register to edit or fork this paste. It's free.