%load_ext autoreload
%autoreload 2
cord19_dataset_folder = "./datasets/CORD-19-research-challenge"
if Path(cord19_dataset_folder).exists():
    print('Good to go')
else:
    print(f'{cord19_dataset_folder} does not exist! Download it using 00_download.ipynb.')
Loading the `metadata.csv` file as a pandas DataFrame.
metadata_df = pd.read_csv(f"{cord19_dataset_folder}/metadata.csv", index_col="cord_uid")
What does the metadata look like?
file_in_metadata_count = len(metadata_df)
f'Total records loaded: {file_in_metadata_count}'
metadata_df.columns
(metadata_df["pdf_json_files"].isna() & metadata_df["pmc_json_files"].isna()).sum()
Each paper is represented by a unique ID called `cord_uid`.
Meaning of columns:

- `sha`: PDF file hash
- `source_x`: source repository, e.g. Biorxiv, Elsevier, etc.
- `title`: paper title
- `doi`, `pmcid`, `pubmed_id`, `Microsoft Academic Paper ID`, `WHO #Covidence`: other document IDs
- `license`: usage license
- `abstract`: plain text abstract
- `publish_time`: publish date
- `journal`: academic journal of publication, if applicable
- `authors`: authors in plain text
- `has_pdf_parse`: whether a PDF parse is available
- `has_pmc_xml_parse`: whether a PubMed Central XML parse is available
- `full_text_file`: pointer to the source file in the dataset
- `url`: URL of the paper's online source
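To get a feel for these fields, the cell below peeks at a handful of columns. The particular subset is only an illustration; any of the columns listed above can be inspected the same way.
metadata_df[["title", "doi", "publish_time", "pdf_json_files", "pmc_json_files", "url"]].head()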
The following cells explore the folder and file structure of the dataset.
def _sample_json_file(path):
    for _, _, file_names in os.walk(path):
        sample_file_name = random.choice(file_names)
        file_path = os.path.join(path, sample_file_name)
        with open(file_path) as file:
            contents = json.load(file)
        return list(contents.keys())
def walk_dataset():
    cum_sum = 0
    for root, folders, files in os.walk(cord19_dataset_folder):
        num_folders = len(folders)
        num_files = len(files)
        if "json" in root and num_folders == 0 and num_files > 0:
            cum_sum += num_files
            print(f"{root}: {num_files} files")
    return cum_sum
source_file_count = walk_dataset()
print(f'Files in metadata: {file_in_metadata_count}, source files: {source_file_count}')
print(f'There are {source_file_count-file_in_metadata_count} more source files than metadata records.')
According to information from the Kaggle community forums, there are at least two recommended procedures for loading the dataset, sketched as pseudocode below.
# First method: iterate over the metadata rows, preferring the PMC parse
# and falling back to the PDF parse (pseudocode).
for row in metadata_file:
    pmc_id = row["pmc_id"]
    if exists pmc_id file in pmc_jsons subfolders:
        return pmc_id file
    # If `pmc_id` is null or there is no file
    sha = row["sha"]
    if exists sha file in pdf_jsons subfolders:
        return sha file

# Second method: iterate over the PDF files and look each one up in the metadata (pseudocode).
for pdf file in pdf files:
    if pdf file sha not in metadata_file shas:
        continue
    else:
        row = metadata_file row with matching pdf file sha
        pmc_id = row["pmc_id"]
        if exists pmc_id file in pmc_jsons subfolders:
            return pmc_id file
        return pdf file
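As a rough, runnable sketch of the first method (a hypothetical helper, not the notebook's actual loader), assuming `pmc_dict` maps PMC ids to file paths and `pdf_dict` maps shas to file paths, as built by `get_id_paths_dicts` further below:
def resolve_parse_path(row, pmc_dict, pdf_dict):
    "Return the preferred parse file for a metadata row: PMC first, then PDF (sketch)."
    pmc_id = row["pmcid"]
    if pd.notna(pmc_id) and pmc_id in pmc_dict:
        return pmc_dict[pmc_id]
    # If `pmcid` is null or there is no PMC file, fall back to the PDF parse.
    sha = row["sha"]
    if pd.notna(sha):
        # A row may list several shas separated by "; "; try each one.
        for candidate in str(sha).split("; "):
            if candidate in pdf_dict:
                return pdf_dict[candidate]
    return None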
def get_id_paths_dicts(cls, cord19_dataset_folder):
    """
    DEPRECATED.
    Builds a dictionary whose keys are paper identifiers of one kind
    (`cls` is a substring to match in the folder path, e.g. "pdf" or "pmc")
    and whose values are the paths to the associated JSON files.
    """
    all_files = {}
    for root, folders, files in os.walk(cord19_dataset_folder):
        num_folders = len(folders)
        num_files = len(files)
        if cls in root and num_folders == 0 and num_files > 0:
            for file_name in files:
                _id = file_name.split(".")[0]
                all_files[_id] = os.path.join(root, file_name)
    return all_files
pdf_dict = get_id_paths_dicts("pdf", cord19_dataset_folder)
pmc_dict = get_id_paths_dicts("pmc", cord19_dataset_folder)
print(f'Number of PDF files with a parsed JSON: {len(pdf_dict)}')
print(f'Number of PMC files with a parsed JSON: {len(pmc_dict)}')
class BasePaper:
    def __init__(self, metadata_row, file_path):
        self._metadata_row = metadata_row
        self._file_path = file_path
        self._file_contents = self._load_json_contents(file_path)
        self._referenced_by = []
        self._references = []

    def __getstate__(self):
        """
        Avoid RecursionErrors by not pickling references.
        """
        state = self.__dict__.copy()
        del state["_referenced_by"]
        del state["_references"]
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._referenced_by = []
        self._references = []

    @staticmethod
    def _load_json_contents(path):
        with open(path) as file:
            contents = json.load(file)
        return contents

    @property
    def title(self):
        return self._metadata_row["title"]

    @property
    def authors(self):
        return self._metadata_row["authors"]

    @property
    def publish_time(self):
        return self._metadata_row["publish_time"]

    @property
    def abstract(self):
        return self._metadata_row["abstract"]

    @property
    def bib_entries(self):
        return self._file_contents["bib_entries"]

    @property
    def doi(self):
        return self._metadata_row["doi"]

    @property
    def url(self):
        return self._metadata_row["url"]

    def register_reference(self, reference):
        self._references.append(reference)
        reference.register_referenced(self)

    def register_referenced(self, referenced):
        self._referenced_by.append(referenced)
class PDFPaper(BasePaper):
pass
class PMCPaper(BasePaper):
pass
def load_papers_from_metadata_file(cord19_dataset_folder):
    'Loads papers by reading the `metadata.csv` file present in the dataset.'
    metadata_df = pd.read_csv(f"{cord19_dataset_folder}/metadata.csv",
                              index_col="cord_uid")
    """
    pdf_dict = get_id_paths_dicts("pdf", cord19_dataset_folder)
    pmc_dict = get_id_paths_dicts("pmc", cord19_dataset_folder)
    """
    papers = []
    # not_found = []
    for idx, row in progress_bar(metadata_df.iterrows(),
                                 total=len(metadata_df)):
        paper = None
        pmc_rel_path = row["pmc_json_files"]
        pdf_rel_path = row["pdf_json_files"]
        # Prefer the PMC parse; otherwise fall back to the first listed PDF parse.
        if pd.notna(pmc_rel_path):
            path = os.path.join(cord19_dataset_folder, pmc_rel_path)
            paper = PMCPaper(row, path)
        elif pd.notna(pdf_rel_path):
            path_splitted = pdf_rel_path.split("; ")[0]
            path = os.path.join(cord19_dataset_folder, path_splitted)
            paper = PDFPaper(row, path)
        if paper is not None:
            papers.append(paper)
    return papers
papers = load_papers_from_metadata_file(cord19_dataset_folder)
print(f'{len(papers)} papers found')
mask_1 = metadata_df["pmc_json_files"].notna()
mask_2 = metadata_df["pdf_json_files"].notna()
print(
f'There are {(mask_1 | mask_2).sum()} files with either parsed PDF or PMC XML.'
)
def paper_as_markdown(paper):
    from IPython.display import display, Markdown
    res = f"""
- **Title:** {paper.title}
- **Authors:** {paper.authors}
- **Publish date/time:** {paper.publish_time}
- **Linked references:** {len(paper._references)}
- **Linked referenced by:** {len(paper._referenced_by)}
- **Abstract:** {paper.abstract}"""
    display(Markdown(res))
paper = random.sample(papers, 1)[0]
paper_as_markdown(paper)
First build a dictionary mapping titles to papers, then use it to link each paper's bibliography entries to papers in the dataset.
def process_references(papers):
    paper_titles = {}
    for paper in progress_bar(papers):
        title = paper.title
        if isinstance(title, str):  # coping with NaNs in titles
            paper_titles[title.lower()] = paper
    num_processed_refs = 0
    num_successfully_processed_refs = 0
    for paper in progress_bar(papers):
        for _, ref in paper.bib_entries.items():
            ref_title = ref["title"].lower()
            if ref_title in paper_titles:
                paper.register_reference(paper_titles[ref_title])
                num_successfully_processed_refs += 1
        num_processed_refs += len(paper.bib_entries)
    return num_processed_refs, num_successfully_processed_refs
num_processed_refs, num_successfully_processed_refs = process_references(papers)
print(f'Total references: {num_processed_refs}.')
print(f'References present in the dataset: {num_successfully_processed_refs} ({num_successfully_processed_refs/num_processed_refs*100:.2f}%).')
Fewer than 7% of references point to papers that are themselves in the dataset. One strategy to mitigate this is to create special nodes for the papers outside the set; this way, the structure of the citation graph is better preserved (a sketch of this idea follows the questions below).
Beyond that, a few other questions must be answered:
- How many papers are referenced at least once in the data set?
- What are the most referenced papers in the dataset? Long tail?
- What are the most referenced papers outside the dataset? Long tail?
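As a rough illustration of the special-node idea, the hypothetical helper below adds a placeholder string node for every reference whose paper is not in the dataset. It assumes a `paper_titles` dictionary like the one built inside `process_references` (lower-cased title to paper) and a networkx graph `G` whose nodes are paper objects; none of this is part of the notebook's library.
def add_external_reference_nodes(G, papers, paper_titles):
    "Add placeholder nodes for references to papers outside the dataset (sketch)."
    for paper in papers:
        for _, ref in paper.bib_entries.items():
            ref_title = ref["title"].lower()
            if ref_title and ref_title not in paper_titles:
                # The lower-cased title string stands in for the missing paper.
                G.add_edge(paper, ref_title)
    return G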
num_references = 0
num_referenced_by = 0
for paper in papers:
    if len(paper._references) > 0:
        num_references += 1
    if len(paper._referenced_by) > 0:
        num_referenced_by += 1
Questions:

- How many papers have at least one reference correctly linked?
- How many papers have been referenced at least once?
print(f'Total papers: {len(papers)}.')
print(f'Number of papers with linked references: {num_references} ({num_references/len(papers)*100:.2f}%).')
print(f'Number of papers referenced at least once: {num_referenced_by} ({num_referenced_by/len(papers)*100:.2f}%).')
What are the most referenced papers?
papers_sorted = sorted(papers, key=lambda p: len(p._referenced_by), reverse=True)
Most cited paper:
paper_as_markdown(papers_sorted[0])
... and its references.
for paper in papers_sorted[0]._references:
    paper_as_markdown(paper)
num_references = sorted([len(paper._references) for paper in papers],
                        reverse=True)
num_referenced_by = sorted([len(paper._referenced_by) for paper in papers],
                           reverse=True)
fig = plt.figure(figsize=(11, 8))
ax = sns.lineplot(y=num_references, x=range(len(papers)))
ax.set(title="Correctly linked paper references", yscale="log");
fig = plt.figure(figsize=(11, 8))
ax = sns.lineplot(y=num_referenced_by, x=range(len(papers)))
ax.set(title="References to papers correclty linked", yscale="log");
G = build_papers_reference_graph(papers)
G.number_of_nodes(), G.number_of_edges()
pr = nx.pagerank(G)
sorted_pr = {k: v for k, v in sorted(pr.items(), key=lambda item: item[1], reverse=True)}
Top-ranked papers in the graph
for paper in list(sorted_pr.keys())[:5]:
    paper_as_markdown(paper)
The papers with the highest PageRank are highly cited and are generally works from past decades that likely form the basis of current research against COVID-19.
The distribution of PageRank scores is shown below.
pr_values = np.array(list(sorted_pr.values()))
# Removing outliers
pr_mean = np.mean(pr_values)
pr_std = np.std(pr_values)
pr_distance = abs(pr_values - pr_mean)
max_std = 1.5
pr_not_outlier = pr_distance < max_std * pr_std
pr_no_outliers = pr_values[pr_not_outlier]
len(pr_values), len(pr_no_outliers), len(pr_values) - len(pr_no_outliers)
fig = plt.figure(figsize=(11, 8))
ax = sns.distplot(pr_values, kde=False, rug=True)
ax.set(title="Distribution of PageRank of papers including outliers", yscale="log");
fig = plt.figure(figsize=(11, 8))
ax = sns.distplot(pr_no_outliers, kde=False, rug=True)
ax.set(title="Distribution of PageRank of papers including outliers", yscale="log");