Skip to content

Instantly share code, notes, and snippets.

View jaklinger's full-sized avatar

Joel Klinger jaklinger

View GitHub Profile
@jaklinger
jaklinger / annual_fields_of_study_usage_spec.json
Created July 20, 2021 13:20
Vega-Lite spec: annual_fields_of_study_usage from Aleks & Kostas
{
"config": {
"view": {"continuousWidth": 400, "continuousHeight": 500},
"axis": {"labelFontSize": 12, "titleFontSize": 12},
"legend": {"labelFontSize": 12, "titleFontSize": 12}
},
"concat": [
{
"mark": "bar",
"encoding": {
@jaklinger
jaklinger / read_cord.py
Created May 27, 2021 12:36
Read CORD19 data
from tempfile import TemporaryFile
import requests
import shutil
import tarfile
import csv
URL = 'https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_{date}.tar.gz'
CSV_PATH = '{date}/all_sources_metadata_{date}.csv'
def stream_to_file(url, fileobj):
@jaklinger
jaklinger / read_torch_from_s3.py
Last active June 9, 2024 14:50
Read a torch model from S3
import torch
from contextlib import contextmanager
import boto3
from io import BytesIO
from transformers import PretrainedConfig, PreTrainedModel
import json
from tempfile import NamedTemporaryFile
BUCKET_NAME = "open-jobs-lake"
import pandas as pd
import numpy as np
import time
from functools import lru_cache
WINNER_TEXT = "The winner is"
@lru_cache()
def get_data():
data = pd.read_excel("ProFinda Registration Drive (Responses).xlsx").to_dict(orient="records")
@jaklinger
jaklinger / is_tech.py
Last active September 16, 2020 16:14
Tells you whether a query is related to tech
import wikipedia
from functools import lru_cache
@lru_cache()
def get_page_cats(query):
    """Return the Wikipedia categories of the page titled ``query``.

    The page is looked up with ``auto_suggest=False``, so ``query`` is used
    as given rather than being replaced by a suggested title. Results are
    memoised per ``query`` by ``lru_cache``.

    Args:
        query: Page title to look up (must be hashable for the cache).

    Returns:
        ``page.categories`` for the matching page, or an empty list when the
        lookup raises ``wikipedia.PageError`` or
        ``wikipedia.DisambiguationError``.
    """
    try:
        article = wikipedia.page(query, auto_suggest=False)
    except (wikipedia.PageError, wikipedia.DisambiguationError):
        # No page, or an ambiguous title: report "no categories".
        categories = []
    else:
        categories = article.categories
    return categories
@jaklinger
jaklinger / arxiv_vectors.py
Last active August 11, 2020 13:31
read arxiv vectors
from nesta.core.orms.orm_utils import db_session, get_mysql_engine
from nesta.core.orms.arxiv_orm import ArticleVector
import numpy as np
import json
import os
os.environ['MYSQLDB'] = "/path/to/innovation-mapping-5712.config"
def query_and_bundle(session, fields, start, limit, filter_):
q = session.query(*fields)
@jaklinger
jaklinger / schema_dump.py
Created June 1, 2020 13:30
dump schema from data_getters
from collections import defaultdict
# Substrings that mark a table name as excluded from the dump.
ignore = ["2020", "onsOpenGeo", 'worldbank', 'table_updates']

# dataset name -> table name -> list of per-column descriptions.
out_data = defaultdict(dict)
for dataset, tables in schemas.items():
    for name, table in tables.items():
        # Keep only tables whose name contains none of the ignored substrings.
        if not any(i in name for i in ignore):
            # Describe each column as a plain dict of strings/flags.
            table = [
                {
                    "field_name": str(c.key),
                    "type": str(c.type),
                    "primary_key": c.primary_key,
                    "nullable": c.nullable,
                }
                for c in table.columns
            ]
            out_data[dataset][name] = table
@jaklinger
jaklinger / get_covid_xiv.py
Created May 6, 2020 07:53
Get papers from arxiv table, including filtering bio/med/arxiv and basic keyword filtering
import pandas as pd
from data_getters.core import get_engine
def bad_tokenizer(text):
    """Split ``text`` into lowercase, whitespace-delimited tokens.

    This is a deliberately crude tokenizer: the text is lowercased, every
    full stop (``"."``) is removed, and the result is split on whitespace.
    All other punctuation stays attached to its token (e.g. ``"covid-19"``).

    Args:
        text (str): The text to tokenize.

    Returns:
        list[str]: The tokens, in order of appearance; empty for empty or
        whitespace-only input.
    """
    return text.lower().replace(".", "").split()
columns=['id', 'created', 'title', 'abstract', 'mag_id', 'citation_count', 'article_source']
con = get_engine("/path/to/innovation-mapping-5712.config")
chunks = pd.read_sql_table('arxiv_articles', con, columns=columns, chunksize=1000)
keywords = ('covid', 'covid-19', 'coronavirus')
@jaklinger
jaklinger / after_decorator.py
Created May 5, 2020 15:42
after decorator example
def do_the_other_thing(run, output):
    """Build a method that calls ``run`` and then ``output`` on an instance.

    Args:
        run: Callable taking the instance; invoked first.
        output: Callable taking the instance; invoked after ``run`` returns.

    Returns:
        A one-argument function ``wrap(self)`` that calls both callables in
        order with ``self`` and returns ``None``. If ``run`` raises,
        ``output`` is not called.
    """
    def wrap(self):
        # Order matters: ``output`` only runs once ``run`` has returned.
        for step in (run, output):
            step(self)

    return wrap
class A:
name='a'
def run(self):
pass
@jaklinger
jaklinger / assym.py
Created November 1, 2019 16:43
Asymmetry measurement of a square matrix
import numpy as np
def assym(a):
    """Return ``1 - det(S) / det(a)``, where ``S = 0.5 * (a + a.T)``.

    ``S`` is the symmetric part of ``a``; when ``a`` is already symmetric
    ``S`` equals ``a`` and the result is 0 (for non-zero ``det(a)``).

    Args:
        a: Square matrix supporting ``+`` and ``.T`` (e.g. ``np.ndarray`` or
            ``np.matrix``).

    Returns:
        The asymmetry score as a NumPy float.
    """
    symmetric_part = 0.5 * (a + a.T)
    det_ratio = np.linalg.det(symmetric_part) / np.linalg.det(a)
    return 1 - det_ratio
for a in ([[10,123,0],[123,10,0],[0,0,10]], [[10,123,0],[121,10,0],[0,0,10]],
[[10,123,0],[50,10,0],[0,0,10]], [[10,123,0],[0,10,0],[23,0,10]],
[[10,123,0],[-123,10,0],[5422,0,10]]):
a = np.matrix(a)
print(a)