Skip to content

Instantly share code, notes, and snippets.

Avatar
🎯
Focusing

Shubhanshu Mishra napsternxg

🎯
Focusing
View GitHub Profile
@napsternxg
napsternxg / download_images.py
Created Sep 29, 2020
Get humans with sex, gender, ethnic_group, and image in Wikidata https://w.wiki/eLe
View download_images.py
from pathlib import Path
import shutil
import requests
REQUIRED_COLS = ['human', 'image', 'sex_or_gender', 'ethnic_group', 'date_of_birth', 'occupation', 'loc_aid',]
def parse_row(row):
data = {}
for c in REQUIRED_COLS:
value = row[c]["value"]
@napsternxg
napsternxg / spark_wikidata_dump.py
Created Aug 21, 2020
Reading Wikidata dumps via Spark
View spark_wikidata_dump.py
# Takes around 30 minutes just to show df.head()
%%time
wikidata_dump_path="/path/to/latest-all.json.bz2"
df = sql.read.option("multiline", "true").json(wikidata_dump_path)
df.head()
@napsternxg
napsternxg / MultimodaVAE.dot
Last active Aug 15, 2020
Multimodal VAE graphical model. To render it, paste the code into https://dreampuf.github.io/GraphvizOnline/
View MultimodaVAE.dot
digraph G {
subgraph cluster_1 {
style=filled;
color=pink;
//node [style=filled,color=white];
"x_1" -> "z_1";
label="encoder1";
}
View read_glove_embeddings.py
import numpy as np
class Embedding(object):
def __init__(self, unk_token=None):
self.unk_token = unk_token
self.word2id = {unk_token: 0}
self.id2word = [unk_token]
self.vectors = []
def __len__(self):
View brown_clustering.py
# Brown Clusters
# Algorithm 2 taken from (Slide 15): http://aritter.github.io/courses/5525_slides/brown.pdf
import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
@napsternxg
napsternxg / wikidata_subclass.sparql
Created Jul 14, 2020
Wikidata: get all subclasses of a given class
View wikidata_subclass.sparql
# Lists every class linked to Human (wd:Q5) through a chain of "subclass of"
# (wdt:P279) statements, together with its label and, when present, its English rdfs:label.
SELECT ?subClass ?subClassLabel ?desc WHERE {
?subClass wdt:P279* wd:Q5. # All direct and transitive subclasses of Human; the `*` path is zero-or-more, so wd:Q5 itself also matches
# OPTIONAL keeps classes that have no English label; ?desc is simply left unbound for them.
OPTIONAL {
?subClass rdfs:label ?desc.
FILTER((LANG(?desc)) = "en")
}
# The label service binds ?subClassLabel; the language list asks for the automatically
# detected language first, then English.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
@napsternxg
napsternxg / collect_core_data.py
Created Jun 12, 2020
Core Data for WOSP 2020 3C shared task.
View collect_core_data.py
API_KEY="" # Get from https://core.ac.uk/services/api/
CORE_REQUEST_URL=f"https://core.ac.uk:443/api-v2/articles/get?metadata=true&fulltext=true&citations=true&similar=false&duplicate=false&urls=false&faithfulMetadata=false&apiKey={API_KEY}"
def get_paper_data(core_ids, batch_size=10):
core_ids = list(core_ids)
for i in range(0, len(core_ids), batch_size):
batch = core_ids[i:i+batch_size]
resp = requests.post(CORE_REQUEST_URL, json=batch)
batch_resp_json = resp.json()
yield batch_resp_json
print(f"Found {len(batch_resp_json)} responses. Sleeping for 2 seconds.")
@napsternxg
napsternxg / pyspark_print_schema_indented.py
Created Apr 29, 2020
Print pyspark schema in indented format
View pyspark_print_schema_indented.py
def print_schema(schema, level=0):
for item in schema:
elementType = None
elementTypeName = ""
if hasattr(item.dataType, "elementType"):
elementType = item.dataType.elementType
elementTypeName = item.dataType.elementType.typeName()
#print("{}{}".format("\t"*level, item))
print("{}{}\t{}\t{}\t{}".format(
"\t"*level,
View newsguardtech_covid_misinfo_urls.json
[{
"text": "ActivistPost.com",
"href": "https://www.newsguardtech.com/wp-content/uploads/2020/03/activistpost.com-ENG.pdf",
"country": "United States"
}, {
"text": "AmericanThinker.com",
"href": "https://www.newsguardtech.com/wp-content/uploads/2020/03/AmericanThinker.com-3-24-20.pdf",
"country": "United States"
}, {
"text": "BeforeItsNews.com",
View WRP_national.csv
We can't make this file beautiful and searchable because it's too large.
year,state,name,chrstprot,chrstcat,chrstorth,chrstang,chrstothr,chrstgen,judorth,jdcons,judref,judothr,judgen,islmsun,islmshi,islmibd,islmnat,islmalw,islmahm,islmothr,islmgen,budmah,budthr,budothr,budgen,zorogen,hindgen,sikhgen,shntgen,bahgen,taogen,jaingen,confgen,syncgen,anmgen,nonrelig,othrgen,sumrelig,pop,chrstprotpct,chrstcatpct,chrstorthpct,chrstangpct,chrstothrpct,chrstgenpct,judorthpct,judconspct,judrefpct,judothrpct,judgenpct,islmsunpct,islmshipct,islmibdpct,islmnatpct,islmalwpct,islmahmpct,islmothrpct,islmgenpct,budmahpct,budthrpct,budothrpct,budgenpct,zorogenpct,hindgenpct,sikhgenpct,shntgenpct,bahgenpct,taogenpct,jaingenpct,confgenpct,syncgenpct,anmgenpct,nonreligpct,othrgenpct,sumreligpct,total,dualrelig,datatype,sourcereliab,recreliab,reliabilevel,Version,sourcecode
1945,2,USA,66069671,38716742,1121898,2400000,1956807,110265118,821489,1364508,1902885,552300,4641182,0,0,0,0,0,0,0,0,0,0,1601218,1601218,0,0,0,0,0,0,0,0,0,0,22874544,545938,139382062,139928000,0.4722,0.2767,0.0080,0.0172,0.0140,0.788
You can’t perform that action at this time.