PYTHON 28
###### FinalProject.py Guest on 29th July 2020 11:48:36 AM
1. import sys
2. import os
3. import numpy as np
4. import pandas as pd
5. import seaborn as sns
6. import matplotlib as mpl
7. from pandas import DataFrame as df
8.
9. import scipy
10. from scipy.stats import zscore
11. from sklearn import linear_model
12. from matplotlib import pyplot as plt
13.
14. reg = linear_model.LinearRegression()
15.
16.
17. mpl.rcParams['pdf.fonttype'] = 42
18. mpl.rcParams['ps.fonttype'] = 42
19.
20.
21. __Author__ = 'Peihao Fan'
22. __Date__ = 'Dec 1, 2019'
23.
24.
25.
26. """
27. Do the multivariable linear regression to know the coefficient of expressions of different genes representing different disease and pathways.
28.
29. 1. prepare the data;
30. 2. z score
31.
32. """
33.
34.
35. def bootsrapResample(x, y, n):
36.
37.     '''
38.    To get the confidence interval of coefficient, we need to use bootsrapping to get a series of coefficients.
39.    Input:
40.        x: list of gene expression data of coefficient genes.
41.        y: list of gene expression data of target gene.
42.        n: number of samples
43.    Output:
44.        resampled_x:
45.        resampled_y:
46.    '''
47.
48.     resample_i = np.floor(np.random.rand(n) * n).astype(int)
49.
50.     resampled_x = [x[i] for i in resample_i]
51.     resampled_y = [y[i] for i in resample_i]
52.
53.     return resampled_x, resampled_y
54.
55.
56.
57.
58. def runReg(x, y, n):
59.
60.     '''
61.    Run the multivariable linear regression, using bootstrapping resampled samples.
62.    Input:
63.        x: list of gene expression data of coefficient genes.
64.        y: list of gene expression data of target gene.
65.        n: number of samples
66.    Output:
67.        beta_df: coefficients of gene expressions, for 1000 times.
68.    '''
69.
70.     beta_matrix = []
71.
72.     for i in range(1000):
73.
74.         # resampling
75.         resampled_x, resampled_y = bootsrapResample(x=x, y=y, n=n)
76.
77.         reg.fit(resampled_x, resampled_y)
78.
79.         beta_matrix.append(reg.coef_)
80.
81.     beta_df = df(beta_matrix, columns=sys.argv[2:])
82.
83.
84.     return beta_df
85.
86.
87.
88.
89. gene_list = sys.argv[1:]
90. exp_df = df()
91. for gene in gene_list:
92.     exp_gene = pd.read_csv('mRNA expression (RNAseq)_ '+ gene +'.txt', index_col=0, header=0, sep='\t').T
93.
94.     exp_df = pd.concat([exp_df, exp_gene], axis=1)
95.
96. exp_df = exp_df.dropna()
97.
98. exp_df = exp_df.apply(zscore)
99.
100.
101.
102. y = list(exp_df[sys.argv[1]])
103. x_gene_list = sys.argv[2:]
104.
105. x = [[exp_df.loc[i, gene] for gene in x_gene_list] for i in exp_df.index]
106.
107. n = exp_df.shape[0]
108.
109. beta_df = runReg(x=x, y=y, n=n)
110.
111. beta_df.to_csv('coefficients %s.csv' % str(sys.argv[1:]))
112.
113.
115.
116. x_gene_list = sys.argv[2:]
117.
118. plot_df = df()
119. for gene in x_gene_list:
120.     beta_gene = df(beta_df[gene])
121.     beta_gene = beta_gene.rename(columns={gene: 'beta'})
122.     beta_gene.loc[:, 'gene_name'] = gene
123.     plot_df = pd.concat([plot_df, beta_gene], axis=0)
124.
125. sns.stripplot(x="gene_name", y="beta",  data=plot_df, dodge=True,
126.     jitter=True, palette=['skyblue','lightcoral'], alpha=1, size=5, zorder=0)
127. sns.boxplot(x="gene_name", y="beta",  data=plot_df, dodge=True,
128.     palette=['skyblue','lightcoral'], boxprops={'facecolor':'None'}, fliersize=2, zorder=2)
129.
130. plt.show()

Paste is for source code and general debugging text.

Login or Register to edit, delete and keep track of your pastes and more.

Recent Pastes

Raw Paste

or to edit or fork this paste. It's free.