scRNA_seq/analysisFunctions.py at main · ewesteinde/scRNA_seq · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 10 16:56:15 2024

@author: elenawesteinde
"""

def findAveGeneExpPerCluster(adata, average_path="average.csv", fraction_path="fraction.csv"):
    """Summarise raw gene expression per Leiden cluster and save it as CSV.

    For every category of ``adata.obs['leiden']`` this computes, per gene of
    ``adata.raw``, the mean value and the fraction of cells whose value is
    non-zero. Both tables are written transposed (genes as rows, clusters as
    columns).

    Parameters
    ----------
    adata
        AnnData-like object exposing ``raw.X`` (sparse or dense),
        ``raw.var`` and a categorical ``obs['leiden']`` column with one entry
        per row of ``raw.X``.
    average_path
        Where the mean table is written. Defaults to the historical
        ``"average.csv"`` in the working directory.
    fraction_path
        Where the fraction table is written. Defaults to the historical
        ``"fraction.csv"`` in the working directory.

    Returns
    -------
    tuple
        ``(gene_ids, clusters, obs, average_obs, obs_bool, fraction_obs)``
        where ``obs`` is the cells x genes frame indexed by cluster label,
        ``obs_bool`` is its boolean (non-zero) version, and ``average_obs`` /
        ``fraction_obs`` are the clusters x genes summaries.
    """
    import numpy as np
    import pandas as pd

    gene_ids = adata.raw.var.index.values
    leiden = adata.obs['leiden']
    clusters = leiden.cat.categories

    # Selecting every gene of ``raw`` in its own order is the identity, so the
    # matrix is read directly instead of going through a copying subset.
    matrix = adata.raw.X
    # Sparse matrices need densifying; a dense ``raw.X`` has no ``toarray``
    # and used to raise AttributeError here.
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    else:
        matrix = np.asarray(matrix)

    obs = pd.DataFrame(matrix, columns=gene_ids, index=leiden)

    # ``observed=False`` is stated explicitly so that every cluster category
    # keeps a row irrespective of the pandas version's default.
    average_obs = obs.groupby(level=0, observed=False).mean()

    obs_bool = obs.astype(bool)  # any expression vs none
    bool_by_cluster = obs_bool.groupby(level=0, observed=False)
    fraction_obs = bool_by_cluster.sum() / bool_by_cluster.count()

    average_obs.T.to_csv(average_path)
    fraction_obs.T.to_csv(fraction_path)

    return gene_ids, clusters, obs, average_obs, obs_bool, fraction_obs

def makeResultsTable(adata):
    """Flatten ``adata.uns['rank_genes_groups']`` into a single DataFrame.

    Each group named in the ``names`` record array contributes two columns,
    in this order: ``<group>_n`` holding that group's ``names`` field and
    ``<group>_s`` holding its ``scores`` field.

    Parameters
    ----------
    adata
        Object whose ``uns['rank_genes_groups']`` maps ``'names'`` and
        ``'scores'`` to structured arrays with one field per group.

    Returns
    -------
    pandas.DataFrame
        One row per rank position, two columns per group.
    """
    import pandas as pd

    ranked = adata.uns['rank_genes_groups']

    table_columns = {}
    for cluster in ranked['names'].dtype.names:
        for field in ('names', 'scores'):
            # Suffix is the first letter of the field: "n" or "s".
            table_columns[f"{cluster}_{field[:1]}"] = ranked[field][cluster]

    return pd.DataFrame(table_columns)


def geneScoresPerCluster(results_table, gene_ids, clusters):
    """Build a genes x clusters table of ranking scores.

    ``results_table`` is expected to hold, for each cluster, a ``<cluster>_n``
    column of gene names and a ``<cluster>_s`` column of the matching scores
    (the layout produced by ``makeResultsTable``). Each cluster's scores are
    reordered so that they line up with the alphabetically sorted gene ids.

    Parameters
    ----------
    results_table
        DataFrame with paired ``_n`` / ``_s`` columns per cluster.
    gene_ids
        All gene identifiers; the result is indexed by ``np.sort(gene_ids)``.
    clusters
        Cluster labels used as the result's columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by sorted gene id, one column per cluster. Clusters without
        columns in ``results_table`` stay NaN.

    Raises
    ------
    ValueError
        If a cluster's gene list does not have one entry per gene id, since
        the positional alignment below would then be meaningless.
    """
    import numpy as np
    import pandas as pd

    genes_ordered = {}
    scores_ordered = {}
    for col in results_table.columns:
        group = col[:-2]
        # Decide by suffix only: a substring test misfiles renamed clusters
        # such as "neurons", whose name contains both "n" and "s".
        if col.endswith('_n'):
            genes_ordered[group] = results_table[col].to_numpy()
        elif col.endswith('_s'):
            scores_ordered[group] = results_table[col].to_numpy()

    geneScore_cluster = pd.DataFrame(index=np.sort(gene_ids), columns=clusters)
    n_genes = len(geneScore_cluster.index)

    for group, cluster_genes in genes_ordered.items():
        if len(cluster_genes) != n_genes:
            raise ValueError(
                f"cluster {group!r} ranks {len(cluster_genes)} genes "
                f"but {n_genes} gene ids were given"
            )
        # Sorting the cluster's gene names gives the permutation that puts
        # its scores in the same order as the sorted row index.
        sort_index = np.argsort(cluster_genes)
        # Whole-column assignment instead of ``frame[col].iloc[...] = ...``:
        # the chained form writes to a temporary under pandas Copy-on-Write.
        geneScore_cluster[group] = scores_ordered[group][sort_index]

    return geneScore_cluster

def renameClusters(adata, toRename, newNames, toPlot):
    """Rename Leiden clusters in place and optionally redraw the UMAP.

    ``adata.uns['cluster_dic']`` is updated with ``toRename[i] -> newNames[i]``
    and its values, in dictionary order, become the new category names of
    ``'leiden'``. Marker genes are then recomputed with the Wilcoxon method so
    that ``rank_genes_groups`` uses the new names.

    Parameters
    ----------
    adata
        AnnData object with ``uns['cluster_dic']`` and a ``'leiden'``
        categorical. NOTE(review): ``cluster_dic`` is assumed to hold exactly
        one entry per category, in category order - confirm where it is built.
    toRename
        Keys of ``cluster_dic`` to rename.
    newNames
        Replacement names, positionally matched to ``toRename``.
    toPlot
        When truthy, draw the UMAP coloured by ``'leiden'`` afterwards (this
        happens whether or not the rename was applied).

    Returns
    -------
    None
        If any key in ``toRename`` is unknown, a message is printed and
        nothing is modified.
    """
    cluster_dic = adata.uns['cluster_dic']

    # Every key must already exist: ``update`` would otherwise add an entry
    # and hand ``rename_categories`` more names than there are categories.
    # The previous ``max(toRename) <= numClusters`` test let such keys through
    # and failed outright on an empty ``toRename``.
    if all(key in cluster_dic for key in toRename):
        # Imported here so the rejection path does not need scanpy.
        import scanpy as sc

        cluster_dic.update({key: newNames[idx] for idx, key in enumerate(toRename)})
        new_cluster_names = list(cluster_dic.values())
        adata.rename_categories('leiden', new_cluster_names)
        adata.uns['cluster_dic'] = cluster_dic
        # Re-rank so the stored marker-gene groups use the new names.
        sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
    else:
        print('Tried to rename a nonexistant cluster')

    if toPlot:
        import scanpy as sc

        sc.pl.umap(adata, color=['leiden'], frameon=False, legend_loc='on data', legend_fontsize='x-small')

def plotGeneScoresAcrossClusters(genesOfInterest,geneScore_cluster):
    """Scatter each gene's per-cluster score, clusters ordered by score.

    One subplot is drawn per gene on a grid at most four columns wide, in a
    10 x 5 inch figure. Within a subplot the clusters run along the x axis in
    ascending score order (x tick labels are hidden) and the y axis shows the
    score.

    Parameters
    ----------
    genesOfInterest
        Sequence of row labels of ``geneScore_cluster`` to plot.
    geneScore_cluster
        DataFrame of scores with genes as rows and clusters as columns.

    Returns
    -------
    None
        The figure is created through ``matplotlib.pyplot``; an empty
        ``genesOfInterest`` draws nothing.
    """
    numSubplots = len(genesOfInterest)
    if numSubplots == 0:
        # Nothing to draw; the grid arithmetic below would divide by zero.
        return

    import math
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator

    numYticks = 10
    numCol = min(4, numSubplots)
    numRow = math.ceil(numSubplots / numCol)

    # squeeze=False always yields a 2-D array of axes, so one gene (1x1 grid)
    # and one row are handled exactly like the general case.
    fig, ax = plt.subplots(numRow, numCol, figsize=(10, 5), squeeze=False)
    clusterNames = geneScore_cluster.columns

    # ``ax.flat`` walks the grid row by row, matching the gene order.
    for count, axis in enumerate(ax.flat):
        if count >= numSubplots:
            # Unused trailing cells of the last row.
            fig.delaxes(axis)
            continue

        gene = genesOfInterest[count]
        geneScores = geneScore_cluster.loc[gene].to_numpy(dtype=float)
        # NumPy's argsort gives plain positions and places NaN last, whereas
        # Series.argsort marks NaN with -1, which would index the wrong item.
        orderedIdx = np.argsort(geneScores)

        axis.scatter(x=clusterNames[orderedIdx], y=geneScores[orderedIdx], s=3)
        axis.set_title(gene)
        axis.set_xticks([])
        axis.yaxis.set_major_locator(MaxNLocator(numYticks))

    # Layout is computed after drawing so titles and tick labels are included.
    fig.tight_layout()