rate_limiter.py
import time

def rate_limited_caller(max_rps=100, sleep_time=10):
    """Decorator factory: pause for sleep_time seconds after every max_rps calls."""
    num_calls = 1
    def caller(func):
        def wrapper(*args, **kwargs):
            nonlocal num_calls
            output = func(*args, **kwargs)
            num_calls += 1
            if (num_calls % max_rps) == 0:
                time.sleep(sleep_time)  # assumed completion of the truncated snippet
            return output
        return wrapper
    return caller
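A minimal usage sketch for the decorator above (assumed, not part of the original file); the function name and loop are illustrative placeholders:

@rate_limited_caller(max_rps=5, sleep_time=1)
def fetch(url):
    print(f"fetching {url}")  # stand-in for a real HTTP call

for i in range(20):
    fetch(f"https://example.com/page/{i}")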
download_images.py
from pathlib import Path
import shutil
import requests

REQUIRED_COLS = ['human', 'image', 'sex_or_gender', 'ethnic_group',
                 'date_of_birth', 'occupation', 'loc_aid']

def parse_row(row):
    # Pull the "value" field of each required column from a query result row
    data = {}
    for c in REQUIRED_COLS:
        value = row[c]["value"]
        data[c] = value  # assumed completion of the truncated snippet
    return data
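A hedged sketch (not in the original snippet) of how the parsed image URL might then be downloaded with the requests and shutil imports above; the helper name and destination directory are illustrative assumptions:

def download_image(url, dest_dir="images"):
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest_path = dest_dir / url.rsplit("/", 1)[-1]  # last URL segment as the filename
    resp = requests.get(url, stream=True, timeout=30)
    resp.raise_for_status()
    with open(dest_path, "wb") as f:
        shutil.copyfileobj(resp.raw, f)  # stream the image body straight to disk
    return dest_path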
spark_wikidata_dump.py
# Takes around 30 minutes just to show df.head()
%%time
wikidata_dump_path = "/path/to/latest-all.json.bz2"
# `sql` is assumed to be an existing SparkSession / SQLContext handle
df = sql.read.option("multiline", "true").json(wikidata_dump_path)
df.head()
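The `sql` handle above is not defined in the preview; a minimal sketch of creating it as a SparkSession (the app name and memory setting are illustrative assumptions):

from pyspark.sql import SparkSession

sql = (SparkSession.builder
       .appName("wikidata-dump")
       .config("spark.driver.memory", "8g")
       .getOrCreate())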
MultimodaVAE.dot
digraph G {
    subgraph cluster_1 {
        style=filled;
        color=pink;
        //node [style=filled,color=white];
        "x_1" -> "z_1";
        label="encoder1";
    }
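A hedged Python sketch (assumes the graphviz package is installed; not part of the original file) for rendering the graph to an image:

from graphviz import Source

src = Source(open("MultimodaVAE.dot").read())
src.render("MultimodaVAE", format="png", cleanup=True)  # writes MultimodaVAE.png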
read_glove_embeddings.py
import numpy as np

class Embedding(object):
    def __init__(self, unk_token=None):
        self.unk_token = unk_token
        self.word2id = {unk_token: 0}  # the unknown token always gets id 0
        self.id2word = [unk_token]
        self.vectors = []
    def __len__(self):
        return len(self.id2word)  # assumed completion of the truncated snippet
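A hedged sketch (assumed, not shown in the truncated preview) of filling an Embedding from a GloVe text file, where each line is a word followed by its vector components:

def load_glove(path, unk_token="<unk>"):
    emb = Embedding(unk_token=unk_token)
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word, vec = parts[0], np.asarray(parts[1:], dtype=np.float32)
            emb.word2id[word] = len(emb.id2word)
            emb.id2word.append(word)
            emb.vectors.append(vec)
    # give the unk token (id 0) a zero vector so vectors line up with id2word
    emb.vectors.insert(0, np.zeros_like(emb.vectors[0]))
    return emb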
brown_clustering.py
# Brown Clusters
# Algorithm 2 taken from (Slide 15): http://aritter.github.io/courses/5525_slides/brown.pdf
import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
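The preview cuts off after the imports. As a hedged illustration of the input Brown clustering works over, here is one way the imports above could be used to build a sparse word-bigram count matrix from 20 newsgroups (the function name and whitespace tokenization are illustrative assumptions):

def bigram_count_matrix(num_docs=100):
    texts = fetch_20newsgroups(subset="train").data[:num_docs]
    counts = defaultdict(int)
    vocab = {}
    for text in texts:
        tokens = text.lower().split()
        for w in tokens:
            vocab.setdefault(w, len(vocab))
        for w1, w2 in zip(tokens, tokens[1:]):
            counts[(vocab[w1], vocab[w2])] += 1
    rows, cols = zip(*counts)  # unzip the (i, j) index pairs
    n = len(vocab)
    return csr_matrix((list(counts.values()), (rows, cols)), shape=(n, n)), vocab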
wikidata_subclass.sparql
SELECT ?subClass ?subClassLabel ?desc WHERE {
  ?subClass wdt:P279* wd:Q5.  # all transitive subclasses of human (Q5), including Q5 itself
  OPTIONAL {
    ?subClass rdfs:label ?desc.
    FILTER((LANG(?desc)) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
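A hedged Python sketch (not part of the original file) for running this query against the public Wikidata Query Service; the User-Agent string is an illustrative placeholder:

import requests

query = open("wikidata_subclass.sparql").read()
resp = requests.get(
    "https://query.wikidata.org/sparql",
    params={"query": query, "format": "json"},
    headers={"User-Agent": "wikidata-subclass-example/0.1"},
)
for row in resp.json()["results"]["bindings"][:5]:
    print(row["subClass"]["value"], row.get("desc", {}).get("value", ""))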
collect_core_data.py
import time
import requests

API_KEY = ""  # Get from https://core.ac.uk/services/api/
CORE_REQUEST_URL = f"https://core.ac.uk:443/api-v2/articles/get?metadata=true&fulltext=true&citations=true&similar=false&duplicate=false&urls=false&faithfulMetadata=false&apiKey={API_KEY}"

def get_paper_data(core_ids, batch_size=10):
    core_ids = list(core_ids)
    for i in range(0, len(core_ids), batch_size):
        batch = core_ids[i:i + batch_size]
        resp = requests.post(CORE_REQUEST_URL, json=batch)
        batch_resp_json = resp.json()
        yield batch_resp_json
        print(f"Found {len(batch_resp_json)} responses. Sleeping for 2 seconds.")
        time.sleep(2)  # assumed completion of the truncated snippet, matching the print above
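A hedged usage sketch; the ids below are hypothetical placeholders, not real CORE article ids:

example_ids = [11111111, 22222222, 33333333]  # hypothetical CORE ids
papers = []
for batch_resp in get_paper_data(example_ids, batch_size=2):
    papers.extend(batch_resp)
print(f"Collected {len(papers)} paper records")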
pyspark_print_schema_indented.py
def print_schema(schema, level=0):
    for item in schema:
        elementType = None
        elementTypeName = ""
        if hasattr(item.dataType, "elementType"):
            elementType = item.dataType.elementType
            elementTypeName = item.dataType.elementType.typeName()
        # print("{}{}".format("\t"*level, item))
        print("{}{}\t{}\t{}\t{}".format(
            "\t" * level,
            item.name,
            item.dataType.typeName(),
            elementTypeName,
            item.nullable))
        # Assumed completion of the truncated snippet: recurse into nested
        # struct / array-of-struct fields, indenting one tab level deeper.
        if hasattr(item.dataType, "fields"):
            print_schema(item.dataType.fields, level + 1)
        elif elementType is not None and hasattr(elementType, "fields"):
            print_schema(elementType.fields, level + 1)
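A hedged usage sketch, assuming the Wikidata DataFrame `df` loaded in spark_wikidata_dump.py above is available in the same session:

print_schema(df.schema.fields)  # one tab of indentation per nesting level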