The purpose of this notebook is to build the binary files used by RISOTTO's GUI.
/Users/lmarti/.pyenv/versions/3.8.2/envs/risotto/lib/python3.8/site-packages/pandas/compat/__init__.py:117: UserWarning: Could not import the lzma module. Your installed Python is incomplete. Attempting to use lzma compression will result in a RuntimeError.
  warnings.warn(msg)
/Users/lmarti/.pyenv/versions/3.8.2/envs/risotto/lib/python3.8/site-packages/spacy/util.py:271: UserWarning: [W031] Model 'en_core_sci_sm' (0.2.4) requires spaCy v2.2 and is incompatible with the current spaCy version (2.3.0). This may lead to unexpected results or runtime errors. To resolve this, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate
  warnings.warn(warn_msg)
 
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Folder holding the CORD-19 dataset; the path is relative, so it resolves against
# the directory the notebook is run from.
# NOTE(review): `Path` is not imported in the visible cells — presumably it comes
# from an import cell not shown here; confirm.
CORD19_DATASET_FOLDER = Path("./datasets/CORD-19-research-challenge")

remove_duplicated_idxs[source]

remove_duplicated_idxs(df)

load_papers_metadata[source]

load_papers_metadata(dataset_folder)

build_papers_artifact[source]

build_papers_artifact(dataset_folder, should_dump=True, dump_path='artifacts/artifacts.hdf')

Returns:

- `joined_df`: a DataFrame with the dataset metadata columns and the PageRank score
    of each paper. The rows are indexed by the `cord_uid` identifier.

load_papers_artifact[source]

load_papers_artifact(artifacts_path='artifacts/artifacts.hdf')

build_papers_topics_artifact[source]

build_papers_topics_artifact(dataset_folder, should_dump=True, dump_path='artifacts/artifacts.hdf')

Returns:

- `papers_df`: a DataFrame. Each row represents a paper and has the following columns:
    - `cord_uid`: index
    - `text`: paper text
    - `topic`: first-level topic identifier
    - `subtopic`: second-level topic identifier
- `topics_df`: a DataFrame. Each row represents a token and has the following columns:
    - `token`: index
    - `{topic_id}`: token pseudocount in first-level topic {topic_id}
    - `{topic_id}-{subtopic_id}`: token pseudocount in second-level topic {subtopic_id} of first-level topic {topic_id}.
NaN values mean that the token wasn't considered in the topic modelling step.

load_papers_topics_artifacts[source]

load_papers_topics_artifacts(artifacts_path='artifacts/artifacts.hdf')

load_topics_artifacts[source]

load_topics_artifacts(artifacts_path='artifacts/artifacts.hdf')

build_artifacts[source]

build_artifacts(dataset_folder, should_dump=True)

build_papers_embeddings[source]

build_papers_embeddings(should_dump=True, dump_path='artifacts/artifacts.hdf')

load_papers_embeddings[source]

load_papers_embeddings(artifacts_path='artifacts/artifacts.hdf')

# Build the artifacts from the CORD-19 dataset folder. `should_dump` is left at its
# default (True), per the signature documented earlier in this notebook.
metapagerank_df, papers_df, topics_df = build_artifacts(CORD19_DATASET_FOLDER)
# Build the papers artifact on its own, again with the default `should_dump=True`
# and `dump_path='artifacts/artifacts.hdf'` from the documented signature.
# NOTE(review): the documentation for build_papers_artifact lists only `joined_df`
# as its return value, while this call unpacks three values — confirm which is
# accurate. This call may also repeat work already done by build_artifacts; verify.
joined_df, pagerank_df, metadata_df = build_papers_artifact(CORD19_DATASET_FOLDER)