@jaklinger
Created May 6, 2020 07:53
Get papers from the arxiv table, including a breakdown by bio/med/arxiv source and basic keyword filtering on abstracts
import pandas as pd
from data_getters.core import get_engine


def bad_tokenizer(text):
    # Crude tokenizer: lowercase, strip full stops, split on whitespace
    return text.lower().replace(".", "").split()


columns = ['id', 'created', 'title', 'abstract', 'mag_id',
           'citation_count', 'article_source']
con = get_engine("/path/to/innovation-mapping-5712.config")
# Stream the table in chunks of 1000 rows rather than loading it all at once
chunks = pd.read_sql_table('arxiv_articles', con, columns=columns, chunksize=1000)
keywords = ('covid', 'covid-19', 'coronavirus')

covid_df = []
for df in chunks:
    # Flag rows whose tokenized abstract contains any of the keywords
    covid = df.abstract.apply(lambda text: text is not None
                              and any(term in bad_tokenizer(text)
                                      for term in keywords))
    if sum(covid) == 0:
        continue
    covid_df.append(df.loc[covid])
covid_df = pd.concat(covid_df)

# Count matching articles by source (arxiv / biorxiv / medrxiv)
for source, subset_df in covid_df.groupby('article_source'):
    print(source, len(subset_df))
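
One caveat with the token-based check above: exact token membership will miss variants such as "coronaviruses". A slightly more robust alternative, sketched below (not part of the original gist), uses a single case-insensitive regex with word boundaries; the pattern and the function name mentions_covid are illustrative assumptions:

import re

# Hypothetical variant of the keyword filter: one compiled, case-insensitive
# regex with word boundaries, so "COVID-19" and "coronaviruses" both match.
KEYWORD_RE = re.compile(r"\b(covid(-19)?|coronavirus\w*)\b", re.IGNORECASE)

def mentions_covid(text):
    # True if the abstract mentions any COVID-related keyword
    return text is not None and KEYWORD_RE.search(text) is not None

# Drop-in replacement for the lambda inside the chunk loop:
#     covid = df.abstract.apply(mentions_covid)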