This notebook explores how much information we can obtain from the citation references of the papers in the COVID-19 Open Research Dataset Challenge (CORD-19).
 
%load_ext autoreload
%autoreload 2
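
The cells below rely on a handful of standard libraries. The following is a minimal import set assumed by the rest of the notebook (the fastprogress import path is an assumption; the original may pull progress_bar from fastai instead).

# Imports assumed by the rest of the notebook
import json
import os
import random
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from fastprogress.fastprogress import progress_bar  # assumption: source of `progress_bar`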

Loading the CORD-19 dataset

cord19_dataset_folder = "./datasets/CORD-19-research-challenge"
if Path(cord19_dataset_folder).exists():
    print('Good to go')
else:
    print(f'{cord19_dataset_folder} does not exist! Download it using 00_download.ipynb.')

Loading the metadata.csv file as a pandas DataFrame.

metadata_df = pd.read_csv(f"{cord19_dataset_folder}/metadata.csv", index_col="cord_uid")

What does the metadata look like?

file_in_metadata_count = len(metadata_df)
f'Total records loaded: {file_in_metadata_count}'
metadata_df.columns
# Records with neither a parsed PDF JSON nor a parsed PMC XML JSON
(metadata_df["pdf_json_files"].isna() & metadata_df["pmc_json_files"].isna()).sum()

Each paper is represented by a unique id called cord_uid.

Meaning of the columns (a sample record is shown just after this list):

  • sha: PDF file hash
  • source_x: source repository, e.g. Biorxiv, Elsevier, etc.
  • title: paper title
  • doi, pmcid, pubmed_id, Microsoft Academic Paper ID, WHO #Covidence: other document ids
  • license: usage license
  • abstract: plain-text abstract
  • publish_time: publish date
  • journal: academic journal of publication, if applicable
  • authors: authors in plain text
  • pdf_json_files: relative path(s) to the parsed PDF JSON file(s), if available
  • pmc_json_files: relative path to the parsed PMC XML JSON file, if available
  • url: URL to paper online source
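
As a quick check against the list above, we can peek at a single record (selecting by position here is purely illustrative; any cord_uid works with .loc):

# Peek at a few fields of the first record
metadata_df.iloc[0][["title", "doi", "publish_time", "journal", "authors"]]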

The following cells explore the folder and file structure of the dataset.

def _sample_json_file(path):
    # Pick a random JSON file under `path` and return its top-level keys.
    for _, _, file_names in os.walk(path):
        sample_file_name = random.choice(file_names)
        file_path = os.path.join(path, sample_file_name)
        with open(file_path) as file:
            contents = json.load(file)
        return list(contents.keys())

def walk_dataset():
    # Count the JSON parse files found in each leaf folder of the dataset.
    cum_sum = 0
    for root, folders, files in os.walk(cord19_dataset_folder):
        num_folders = len(folders)
        num_files = len(files)
        if "json" in root and num_folders == 0 and num_files > 0:
            cum_sum += num_files
            print(f"{root}: {num_files} files")
    return cum_sum
source_file_count = walk_dataset()
print(f'Files in metadata: {file_in_metadata_count}, source files: {source_file_count}')
print(f'Difference between parse files on disk and metadata records: {source_file_count-file_in_metadata_count}.')

According to information found on the Kaggle community site, there are at least two recommended procedures for loading the dataset.

# First method (pseudocode): prefer the PMC parse, fall back to the PDF parse
for row in metadata_file:
    pmc_id = row["pmc_id"]
    if a file named after pmc_id exists in the pmc_json subfolders:
        return that pmc_id file
    # If `pmc_id` is null or there is no such file, try the PDF parse
    sha = row["sha"]
    if a file named after sha exists in the pdf_json subfolders:
        return that sha file

# Second method (pseudocode): start from the PDF files and match them to the metadata
for pdf_file in pdf_files:
    if pdf_file sha not in metadata_file shas:
        continue
    row = metadata_file row with matching pdf_file sha
    pmc_id = row["pmc_id"]
    if a file named after pmc_id exists in the pmc_json subfolders:
        return that pmc_id file
    return pdf_file
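
A rough, runnable sketch of the first method is shown below. The helper name, the pmc_json/pdf_json folder names and the <pmcid>.xml.json / <sha>.json file-naming scheme are assumptions about the dataset layout, not something stated in the notebook.

import glob  # used only for this sketch

def resolve_paper_file(row, dataset_folder=cord19_dataset_folder):
    # Sketch of the first method: prefer the PMC parse, fall back to the PDF parse.
    pmcid = row.get("pmcid")
    if pd.notna(pmcid):
        pattern = os.path.join(dataset_folder, "**", "pmc_json", f"{pmcid}.xml.json")
        matches = glob.glob(pattern, recursive=True)
        if matches:
            return matches[0]
    # If pmcid is null or no such file exists, try the PDF parse
    sha = row.get("sha")
    if pd.notna(sha):
        first_sha = str(sha).split("; ")[0]  # `sha` may list several hashes
        pattern = os.path.join(dataset_folder, "**", "pdf_json", f"{first_sha}.json")
        matches = glob.glob(pattern, recursive=True)
        if matches:
            return matches[0]
    return None

Scanning the whole tree for every row would be slow in practice, which is why the next cells build id-to-path dictionaries up front.
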
def get_id_paths_dicts(cls, cord19_dataset_folder):
    """
    DEPRECATED.
    This function builds the dictionaries whose keys are identifiers
    of some kind of papers (PDF or PMC) and whose values are the paths to
    associated files.
    """
    all_files = {}
    for root, folders, files in os.walk(cord19_dataset_folder):
        num_folders = len(folders)
        num_files = len(files)
        if cls in root and num_folders == 0 and num_files > 0:
            for file_name in files:
                # The file name without its extension is the paper id (PDF sha or PMC id)
                _id = file_name.split(".")[0]
                all_files[_id] = os.path.join(root, file_name)
    return all_files

get_id_paths_dicts[source]

get_id_paths_dicts(cls, cord19_dataset_folder)

DEPRECATED. Builds a dictionary whose keys are paper identifiers of the given kind (PDF sha or PMC id) and whose values are the paths to the associated JSON files.

pdf_dict = get_id_paths_dicts("pdf", cord19_dataset_folder)
pmc_dict = get_id_paths_dicts("pmc", cord19_dataset_folder)
print(f'Number of PDF files with a parsed JSON: {len(pdf_dict)}')
print(f'Number of PMC files with a parsed JSON: {len(pmc_dict)}')

Loading PDF and PMC papers

class BasePaper:
    def __init__(self, metadata_row, file_path):
        self._metadata_row = metadata_row
        self._file_path = file_path
        self._file_contents = self._load_json_contents(file_path)
        
        self._referenced_by = []
        self._references = []
    
    def __getstate__(self):
        """
        Avoid RecursionErrors by not pickling references.
        """
        state = self.__dict__.copy()
        del state["_referenced_by"]
        del state["_references"]
        return state
    
    def __setstate__(self, state):
        self.__dict__.update(state)
        self._referenced_by = []
        self._references = []
        
    @staticmethod
    def _load_json_contents(path):
        with open(path) as file:
            contents = json.load(file)
        return contents

    @property
    def title(self):
        return self._metadata_row["title"]
        
    @property
    def authors(self):
        return self._metadata_row["authors"]
        
    @property
    def publish_time(self):
        return self._metadata_row["publish_time"]
        
    @property
    def abstract(self):
        return self._metadata_row["abstract"]
        
    @property
    def bib_entries(self):
        return self._file_contents["bib_entries"]
    
    @property
    def doi(self):
        return self._metadata_row["doi"]

    @property
    def url(self):
        return self._metadata_row["url"]
    
    def register_reference(self, reference):
        self._references.append(reference)
        reference.register_referenced(self)
    
    def register_referenced(self, referenced):
        self._referenced_by.append(referenced)

class BasePaper[source]

BasePaper(metadata_row, file_path)

class PDFPaper(BasePaper):
    pass

class PDFPaper[source]

PDFPaper(metadata_row, file_path) :: BasePaper

class PMCPaper(BasePaper):
    pass

class PMCPaper[source]

PMCPaper(metadata_row, file_path) :: BasePaper

def load_papers_from_metadata_file(cord19_dataset_folder):
    'Loads papers by reading the `metadata.csv` file present in the dataset.'
    metadata_df = pd.read_csv(f"{cord19_dataset_folder}/metadata.csv",
                              index_col="cord_uid")
    """
    pdf_dict = get_id_paths_dicts("pdf", cord19_dataset_folder)
    pmc_dict = get_id_paths_dicts("pmc", cord19_dataset_folder)
    """

    papers = []
    # not_found = []
    for idx, row in progress_bar(metadata_df.iterrows(),
                                 total=len(metadata_df)):
        
        paper = None
        pmc_rel_path = row["pmc_json_files"]
        pdf_rel_path = row["pdf_json_files"]
        
        if pd.notna(pmc_rel_path):
            path = os.path.join(cord19_dataset_folder, pmc_rel_path)
            paper = PMCPaper(row, path)
        
        if pd.notna(pdf_rel_path):
            # Note: when both parses are available, the PDF parse takes precedence here.
            # `pdf_json_files` may contain several paths separated by "; "; keep the first.
            first_pdf_rel_path = pdf_rel_path.split("; ")[0]
            path = os.path.join(cord19_dataset_folder, first_pdf_rel_path)
            paper = PDFPaper(row, path)
        
        if paper is not None:
            papers.append(paper)

    return papers

load_papers_from_metadata_file[source]

load_papers_from_metadata_file(cord19_dataset_folder)

Loads papers by reading the metadata.csv file present in the dataset.

papers = load_papers_from_metadata_file(cord19_dataset_folder)
print(f'{len(papers)} papers found')
mask_1 = metadata_df["pmc_json_files"].notna()
mask_2 = metadata_df["pdf_json_files"].notna()
print(
    f'There are {(mask_1 | mask_2).sum()} files with either parsed PDF or PMC XML.'
)

Peeking at the paper info we just loaded

def paper_as_markdown(paper):
    from IPython.display import display, Markdown
    res = f"""
- **Title:** {paper.title}
- **Authors:** {paper.authors}
- **Publish date/time:** {paper.publish_time}
- **Linked references:** {len(paper._references)}
- **Linked referenced by:** {len(paper._referenced_by)}
- **Abstract:** {paper.abstract}"""
    display(Markdown(res))

paper_as_markdown[source]

paper_as_markdown(paper)

paper = random.sample(papers, 1)[0]
paper_as_markdown(paper)
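
As a quick illustration of the __getstate__/__setstate__ design in BasePaper, a pickle round-trip intentionally drops the reference links (this cell is illustrative; the variable paper comes from the random sample above):

import pickle

# The links are not pickled, so a round-tripped paper starts with empty link lists
restored = pickle.loads(pickle.dumps(paper))
len(restored._references), len(restored._referenced_by)  # both are 0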

Matching references to titles

We now match each paper's bibliography entries to other papers in the dataset by comparing the reference titles (in plain text) with the paper titles.

A dictionary to map titles to papers.

def process_references(papers):
    paper_titles = {}
    for paper in progress_bar(papers):
        title = paper.title
        if isinstance(title, str):  # coping with NaNs in titles
            paper_titles[title.lower()] = paper

    num_processed_refs = 0
    num_successfully_processed_refs = 0
    for paper in progress_bar(papers):
        for _, ref in paper.bib_entries.items():
            ref_title = ref["title"].lower()
            if ref_title in paper_titles:
                paper.register_reference(paper_titles[ref_title])
                num_successfully_processed_refs += 1
        num_processed_refs += len(paper.bib_entries)
    return num_processed_refs, num_successfully_processed_refs

process_references[source]

process_references(papers)

num_processed_refs, num_successfully_processed_refs = process_references(papers)
print(f'Total references: {num_processed_refs}.')
print(f'References present in the dataset: {num_successfully_processed_refs} ({num_successfully_processed_refs/num_processed_refs*100:.2f}%).')

Fewer than 7% of the references point to papers that are also in the dataset. One strategy to mitigate this is to create special nodes for the papers outside the set; this way, the structure of the citation graph is better preserved.
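
A minimal sketch of that strategy, assuming a networkx directed graph; the function name and the convention of keying external papers by a prefixed, lower-cased title are illustrative, not taken from the notebook.

def build_graph_with_external_nodes(papers):
    # In-dataset references link paper objects directly; references whose title is
    # not found in the dataset become placeholder string nodes ("external papers"),
    # so their citation counts are preserved instead of being dropped.
    title_to_paper = {p.title.lower(): p for p in papers if isinstance(p.title, str)}
    G = nx.DiGraph()
    G.add_nodes_from(papers)
    for paper in papers:
        for ref in paper.bib_entries.values():
            ref_title = ref["title"].lower()
            if not ref_title:
                continue
            target = title_to_paper.get(ref_title, f"external::{ref_title}")
            G.add_edge(paper, target)
    return G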

There are also other questions to answer:

  • How many papers are referenced at least once in the dataset?
  • What are the most referenced papers in the dataset? Long tail?
  • What are the most referenced papers outside the dataset? Long tail? (A sketch addressing this follows the counts below.)
# Count papers with at least one outgoing (references) and incoming (referenced-by) link
num_references = 0
num_referenced_by = 0
for paper in papers:
    if len(paper._references) > 0:
        num_references += 1
    if len(paper._referenced_by) > 0:
        num_referenced_by += 1

Questions:

  • How many papers have at least one reference correctly linked?
  • How many papers have been referenced at least once?
print(f'Total papers: {len(papers)}.')
print(f'Papers with at least one linked reference: {num_references} ({num_references/len(papers)*100:.2f}%).')
print(f'Papers referenced at least once: {num_referenced_by} ({num_referenced_by/len(papers)*100:.2f}%).')
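
The remaining question from the earlier list, the most referenced papers outside the dataset, is not answered by the cells above. A minimal sketch of one way to approach it (the helper name is illustrative, not from the notebook):

from collections import Counter

def most_cited_external_titles(papers, top_n=10):
    # Count reference titles that do not match the title of any paper in the dataset.
    known_titles = {p.title.lower() for p in papers if isinstance(p.title, str)}
    external_counts = Counter()
    for paper in papers:
        for ref in paper.bib_entries.values():
            ref_title = ref["title"].lower()
            if ref_title and ref_title not in known_titles:
                external_counts[ref_title] += 1
    return external_counts.most_common(top_n)

most_cited_external_titles(papers)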

What are the most referenced papers?

papers_sorted = sorted(papers, key=lambda p: len(p._referenced_by), reverse=True)

Most cited paper:

paper_as_markdown(papers_sorted[0])

... and its references.

for paper in papers_sorted[0]._references:
    paper_as_markdown(paper)

Plot time!

Note that the dataset includes papers published before the current pandemic started. The following cells plot the distribution of the number of linked references per paper.

num_references = sorted([len(paper._references) for paper in papers],
                        reverse=True)
num_referenced_by = sorted([len(paper._referenced_by) for paper in papers],
                           reverse=True)
fig = plt.figure(figsize=(11, 8))
ax = sns.lineplot(y=num_references, x=range(len(papers)))
ax.set(title="Correctly linked paper references", yscale="log");
fig = plt.figure(figsize=(11, 8))
ax = sns.lineplot(y=num_referenced_by, x=range(len(papers)))
ax.set(title="References to papers correclty linked", yscale="log");

From the plots we can conclude that the number of linked references roughly follows a power law, with a small fraction of the papers accounting for most of the references.

PageRank

The following cells compute PageRank scores over the paper reference links built above.

build_papers_reference_graph[source]

build_papers_reference_graph(papers)
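
The source of build_papers_reference_graph is not shown here. A plausible minimal implementation, consistent with how the references were registered above, is a directed graph with an edge from each citing paper to every correctly linked reference; treat this as a sketch, not necessarily the notebook's actual code.

def build_papers_reference_graph(papers):
    # One node per paper; a directed edge from a citing paper to each linked reference.
    G = nx.DiGraph()
    G.add_nodes_from(papers)
    for paper in papers:
        for reference in paper._references:
            G.add_edge(paper, reference)
    return G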

G = build_papers_reference_graph(papers)
G.number_of_nodes(), G.number_of_edges()
pr = nx.pagerank(G)
sorted_pr = {k: v for k, v in sorted(pr.items(), key=lambda item: item[1], reverse=True)}

Top-ranked papers in the graph

for paper in list(sorted_pr.keys())[:5]:
    paper_as_markdown(paper)

Here it can be seen that the papers with the highest PageRank are highly cited and, in general, are works from past decades that probably form the basis of current research on COVID-19.

The distribution of PageRank scores will be shown below.

pr_values = np.array(list(sorted_pr.values()))

# Removing outliers
pr_mean = np.mean(pr_values)
pr_std = np.std(pr_values)
pr_distance = abs(pr_values - pr_mean)
max_std = 1.5
pr_not_outlier = pr_distance < max_std * pr_std
pr_no_outliers = pr_values[pr_not_outlier]

len(pr_values), len(pr_no_outliers), len(pr_values) - len(pr_no_outliers)
fig = plt.figure(figsize=(11, 8))
ax = sns.distplot(pr_values, kde=False, rug=True)
ax.set(title="Distribution of PageRank of papers including outliers", yscale="log");
fig = plt.figure(figsize=(11, 8))
ax = sns.distplot(pr_no_outliers, kde=False, rug=True)
ax.set(title="Distribution of PageRank of papers including outliers", yscale="log");