Dissertation_ScRNA-seq/Biological_System_Categorisation.R at master · rnbinama/Dissertation_ScRNA-seq · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
library(dplyr)
library(tidyverse)

# Load the downloaded PubMed search results, which include 800 manually
# labelled pharmacology-focussed topic areas
Search_data <- read_csv("topic_and_wo.csv")

# Look for key terms: keep the papers whose manually assigned pharmacology
# topic (labelled by hand in Excel) contains "immune", and show only the
# topic and title columns
key_terms_search <- select(
  filter(Search_data, str_detect(PHARMACOLOGY.TOPIC, "immune")),
  PHARMACOLOGY.TOPIC,
  Title
)


# Row positions of the paper titles that mention each biological system.
# Matching is a case-insensitive regular-expression search on the title, so
# every alternative below matches anywhere inside a word.
# NOTE(review): papers_joined is not created in this script - presumably it is
# already loaded in the session; confirm where it comes from.
# NOTE(review): very short fragments such as "(NK)", "(fat)" or "(colo)" also
# hit unrelated words (e.g. "link", "fate", "colony") - check the tagged titles.
match_titles <- function(pattern) {
  grep(pattern, papers_joined$Title, ignore.case = TRUE)
}

hepatic <- match_titles("(liver)|(hepat)|(Kupffer)")
neuroscience <- match_titles("(neur)|(brain)|(glia)|(astrocyte)|(inhibit)|(sleep)|(activation)|(CNS)|(circuit)|(cortical)|(cerebral)|(plasticity)|(opioid)|(signal)|(nervous)|(noci)|(audi)")
respiratory <- match_titles("(lung)|(respir)|(airwa)|(Bronch)|(asthma)")
immune <- match_titles("(T cell)|(immun)|(inflam)|(macrophage)|(B cell)|(vaccine)|(lymph)|(myeloma)|(antiv|antib)|(NK)")
gastrointestinal <- match_titles("(gastr)|(intest)|(gut)|(Smooth muscle)|(colo)|(enterochromaffin)|(esophag)")
cardiovascular <- match_titles("(cardi)|(heart)|(aort)|(ather)|(pulm)|(vascular)|(ventricle)|(arter)|(hypertension)|(thrombo)")
reproductive <- match_titles("(follic)|(testic)|(sperm)|(oocyte)|(gesta)|(mamma)|(breast)|(Brca)|(ovar)|(estrogen)|(prostate)|(embryo)|(placenta)|(sex)")
endocrine <- match_titles("(pancrea)|(metabolic)|(fat)|(hypothal)|(diabet)|(pituitary)|(adipo)|(thyroid)")
hematologic <- match_titles("(hema)|(blood)|(thrombosis)|(leukemia)|(plasma)|(granulocyte)|(circulat)")
renal <- match_titles("(kidney)|(renal)|(nephro)|(Ischem)|(necros)|(fibrosis)|(urine)|(bladder)|(proxima)")
rheumatological <- match_titles("(arthritis)|(rheuma)|(fibro)|(mesenchymal)|(skeletal)|(Osteopo)")
ophthalmic <- match_titles("(eye)|(ophth)|(visual)|(retina)")
tumour <- match_titles("(tumour)|(tumor)")

# Create an empty topic column
papers_joined$Topic <- ""

# Assign topics depending on the keywords found above.
# The assignments run in sequence, so a title matched by several keyword sets
# keeps the label written last (e.g. "Tumour" overrides every other system).
papers_joined$Topic[hepatic] <- "Hepatic system"
papers_joined$Topic[neuroscience] <- "Nervous system"
papers_joined$Topic[respiratory] <- "Respiratory system"
papers_joined$Topic[immune] <- "Immune system"
# Label spelling corrected (was "Gastrointesntinal system")
papers_joined$Topic[gastrointestinal] <- "Gastrointestinal system"
papers_joined$Topic[cardiovascular] <- "Cardiovascular system"
papers_joined$Topic[reproductive] <- "Reproductive system"
papers_joined$Topic[endocrine] <- "Endocrine system"
# The hematologic keyword matches are labelled as the circulatory system
papers_joined$Topic[hematologic] <- "Circulatory system"
papers_joined$Topic[renal] <- "Renal system"
# The rheumatological keyword matches are labelled as the skeletal system
papers_joined$Topic[rheumatological] <- "Skeletal system"
papers_joined$Topic[ophthalmic] <- "Ophthalmic system"
papers_joined$Topic[tumour] <- "Tumour"

# Show how many papers ended up under each topic label after tagging
table(papers_joined[["Topic"]])

# List the titles of the leftover (untagged) papers to decide which tag they
# require
with(papers_joined, Title[Topic == ""])

# Count the leftover papers that still have no topic
sum(papers_joined[["Topic"]] == "")

# Set the seed so the same random papers are drawn on every run
set.seed(12345)

# Table generation: a stratified random sample of 5 papers per biological
# system.
# NOTE(review): Topic is read from Search_data here, while the keyword tagging
# above writes Topic onto papers_joined - confirm that Search_data already
# carries the Topic column.
# NOTE(review): sample_n() is superseded by slice_sample(); switching may
# change which rows are drawn for this seed, so it is kept as is.
final_table <- Search_data %>%
  select(-PHARMACOLOGY.TOPIC, -NIHMS.ID, -First.Author) %>% # remove unnecessary study details
  group_by(Topic) %>% # stratify using the 'biological system' tag
  filter(Topic != "", n() >= 5) %>% # drop untagged papers and topics with fewer than 5 papers
  sample_n(size = 5) %>% # select 5 papers per system
  ungroup() # drop the grouping so later steps receive a plain table

# Save as csv file for Excel data extraction (QC thresholds)
write.csv(final_table, "review_filtered_papers_tk2.csv", row.names = FALSE)


# Second table to replace the ineligible studies, PER SYSTEM

# Remove every paper already drawn into final_table (matched on PMID) from the
# originally read-in table, so the replacement sample cannot overlap or
# duplicate the first one
remaining_data <- anti_join(Search_data, final_table, by = "PMID")

# Same seed and same stratified draw as the first table, applied to the
# papers that are left
set.seed(12345)
final_table_2 <- remaining_data %>%
  select(-PHARMACOLOGY.TOPIC, -NIHMS.ID, -First.Author) %>% # remove unnecessary study details
  group_by(Topic) %>% # stratify by biological system
  filter(Topic != "", n() >= 5) %>% # drop untagged papers and topics with fewer than 5 papers left
  sample_n(size = 5) %>% # select 5 papers per system
  ungroup() # drop the grouping so later steps receive a plain table

write.csv(final_table_2, "review_filtered_papers_tk3.csv", row.names = FALSE)


# Third table to replace 4 ineligible studies (hence only those two topics
# were included)

# Exclude the papers already used in the second table (matched on PMID)
remaining_data_2 <- anti_join(remaining_data, final_table_2, by = "PMID")

set.seed(12345)
final_table_3 <- remaining_data_2 %>%
  filter(Topic %in% c("Nervous system", "Respiratory system")) %>% # only the two systems needing replacements
  select(-PHARMACOLOGY.TOPIC, -NIHMS.ID, -First.Author) %>% # remove unnecessary study details
  group_by(Topic) %>% # stratify by biological system
  filter(n() >= 5) %>% # keep a topic only if at least 5 papers remain
  sample_n(size = 5) %>% # select 5 papers per system
  ungroup() # drop the grouping before the table is written out

write.csv(final_table_3, "review_filtered_papers_tk4.csv", row.names = FALSE)