import ast
# To delete after debugging; only needed for the interactive console at the end
import code
import copyreg
import datetime
import functools
import json
import os
import re
import time
from ast import literal_eval
from collections import defaultdict
from urllib.parse import urlsplit
import gnureadline
import numpy as np
import openai
import pandas as pd
import ray
import tiktoken
from countryguess import guess_country
from fuzzysearch import find_near_matches
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
def load_mapping():
    with open("domains.csv", "r") as f:
        lines = f.readlines()
    return {line.split(",")[0].strip(): line.split(",")[1].strip() for line in lines}
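# Illustrative note (an assumption; domains.csv is not shipped with this gist): the file
# is expected to hold two comma-separated columns mapping an email suffix to a country, e.g.
#   edu,United States
#   uk,United Kingdom
# so load_mapping() returns something like {"edu": "United States", "uk": "United Kingdom"}.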
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
def pickle_Encoding(enc):
    return (
        functools.partial(
            tiktoken.core.Encoding,
            enc.name,
            pat_str=enc._pat_str,
            mergeable_ranks=enc._mergeable_ranks,
            special_tokens=enc._special_tokens,
        ),
        (),
    )
copyreg.pickle(tiktoken.core.Encoding, pickle_Encoding)
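# Why the reducer above is registered: Ray ships module-level objects such as `enc` to
# worker processes by pickling them, and (in the tiktoken versions this script appears to
# target) Encoding instances are not picklable out of the box. The reducer rebuilds the
# encoding from its name, split regex, merge ranks, and special tokens. A minimal sanity
# check, assuming the registration above:
#
#   import pickle
#   assert pickle.loads(pickle.dumps(enc)).name == enc.name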
countries = (
    "("
    + "|".join(
        [
            re.escape(country)
            for country in open("list-of-countries.txt", "r").read().split("\n")
        ]
        + [","]
    )
    + ")"
)
# Manually add Chinese since it's not in the taxonomy
langs = (
    r"\b("
    + "|".join([re.escape(lang) for lang in pd.read_csv("lang2tax.txt").language.array])
    + "|Chinese"
    + r")\b"
)
lang_pattern = re.compile(langs, flags=re.IGNORECASE)
uni_db = pd.read_csv("world-universities.csv").astype(str)
uni_db["uni_website"] = uni_db["uni_website"].map(
    lambda x: urlsplit(x).netloc.replace("www.", "")
)
uni_db = uni_db.set_index(["uni_name"]).sort_index()
zcdb = pd.read_csv(
    "allCountries.txt",
    sep="\t",
    dtype=str,
    names=[
        "country_code",
        "postal_code",
        "place_name",
        "admin1_name",
        "admin1_code",
        "admin2_name",
        "admin2_code",
        "admin3_name",
        "admin3_code",
        "latitude",
        "longitude",
        "accuracy",
    ],
).set_index(["postal_code"])
cedex_pattern = r"CEDEX( [0-9])?"
zcdb.index = zcdb.index.map(str).map(lambda x: re.sub(cedex_pattern, "", x))
zcdb = zcdb.astype(str).sort_index()
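# Context for the CEDEX handling above: some French postal codes in the GeoNames dump
# carry a "CEDEX" marker (business mail); stripping it from the index presumably lets
# the plain postal codes found in author addresses still match.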
country_pattern = re.compile(countries, flags=re.IGNORECASE)
zip_patterns = [
    r"[0-9]{4,5}",
    r"(?i)([a-z0-9][a-z0-9\- ]{0,10}[a-z0-9])",
]  # First Pattern for the most common formats, second captures all global formats in the GeoNames DB
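# Illustrative example (not part of the pipeline): the first pattern pulls plain
# 4-5 digit codes out of an address string, e.g.
#   re.findall(zip_patterns[0], "77 Massachusetts Ave, Cambridge, MA 02139")
#   -> ['02139']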
email_mapping = load_mapping()
def get_prefixes(acl_id):
    prefix = acl_id[0:] if acl_id[0].isalpha() else acl_id[5:]
    first_prefix = prefix[0].upper()
    second_prefix = prefix.split("-")[0]
    return first_prefix, second_prefix
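# Worked examples (assuming ACL Anthology style IDs):
#   get_prefixes("P19-1001")         -> ("P", "P19")   # old-style ID, kept whole
#   get_prefixes("2020.acl-main.1")  -> ("A", "acl")   # new-style ID, year prefix dropped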
def get_json(row):
    acl_id = row["acl_id"]
    first_prefix, second_prefix = get_prefixes(acl_id)
    try:
        row["json"] = open(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix}/{acl_id}.json", "r"
        ).read()
    except:
        # Fall back to a second filename layout derived from the paper URL.
        prior = row["url"].split("/")[-3][-2:]
        latter = row["url"].split("/")[-1].replace(".pdf", "")
        print(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json"
        )
        try:
            row["json"] = open(
                f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json",
                "r",
            ).read()
        except:
            row["json"] = None
    return row
def get_locations(author):
    # Try several increasingly fuzzy signals for the author's country, in order:
    # email suffix, explicit country field, postal code, email domain vs. university
    # websites, and finally fuzzy matching of the institution name.
    affiliation = author["affiliation"]
    if "email" in author and author["email"].split(".")[-1] in email_mapping:
        return [email_mapping[author["email"].split(".")[-1]]]
    if "location" in affiliation and "country" in affiliation["location"]:
        countries = []
        for country in re.split(
            country_pattern,
            affiliation["location"]["country"].replace(" and ", ","),
        ):
            guess = guess_country(country)
            if guess:
                countries.append(guess["name_short"])
        if len(countries) > 0:
            return countries
    if "location" in affiliation and len(affiliation["location"]) > 0:
        # Look the address's postal code up in the GeoNames table and keep matches
        # whose place or admin-area name also appears in the address.
        location = defaultdict(str, affiliation["location"])
        full_addr = f"{location['addrLine']} {location['settlement']} {location['region']} {location['postCode']}"
        if "postCode" in location:
            code_matches = []
            zip_codes = [
                match
                for pattern in zip_patterns
                for match in re.findall(pattern, full_addr)
            ]
            zip_codes = [code for code in zip_codes if code in zcdb.index]
            zipc = zcdb.loc[zip_codes]
            for code in zipc.iloc:
                if (
                    code.place_name in full_addr
                    or (code.admin1_code in full_addr and code.admin1_code.isalpha())
                    or code.admin1_name in full_addr
                ):
                    code_matches.append(guess_country(code.country_code)["name_short"])
            if len(code_matches) > 0:
                return code_matches
    if "email" in author and author["email"]:
        # Match the email domain (or its registrable suffix) against university websites.
        author_website = author["email"].split("@")[-1]
        if author_website in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_website].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]
        author_tld_only = (
            ".".join(author_website.split(".")[-2:])
            if len(author_website.split(".")) > 2
            else None
        )
        if author_tld_only and author_tld_only in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_tld_only].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]
        tld_only = uni_db.uni_website.map(lambda x: ".".join(x.split(".")[-2:]))
        if author_website in tld_only.array:
            return [
                guess_country(
                    uni_db[tld_only == author_website].iloc[0]["country_code"]
                )["name_short"]
            ]
    if "institution" in affiliation or "laboratory" in affiliation:
        # Fuzzy-match university names against the free-text lab/institution string,
        # then drop universities whose matched text is contained in a longer match
        # for a different university.
        lab_and_school = f"{affiliation['laboratory']} {affiliation['institution']}"
        school_matches = []
        for uni in uni_db.index.array:
            cand_match = find_near_matches(
                uni, lab_and_school, max_l_dist=1, max_substitutions=0, max_deletions=0
            )
            if len(cand_match) > 0:
                match = cand_match[0].matched
                school_matches.append((match, uni))
        c = school_matches
        school_matches = [
            match[1]
            for match in school_matches
            if all(
                [
                    (match[0] not in c_match[0] or match[1] == c_match[1])
                    for c_match in school_matches
                ]
            )
        ]
        if len(school_matches) > 0:
            countries = []
            for match in school_matches:
                country_info = uni_db.loc[match].country_code
                if isinstance(country_info, str) and country_info != "nan":
                    countries.append(guess_country(country_info)["name_short"])
            if len(countries) > 0:
                return countries
    return []
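# Minimal usage sketch (the author dict below is hypothetical; its shape mirrors the
# affiliation fields that get_locations reads):
#
#   example_author = {
#       "affiliation": {
#           "institution": "Sorbonne Universite",
#           "laboratory": "",
#           "location": {"country": "France"},
#       }
#   }
#   get_locations(example_author)  # expected to resolve to ['France'] via the country field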
def get_institutions(row):
    # Union of every author's resolved countries; the str(x) == x filter keeps only
    # string-typed values (dropping NaNs and other non-strings).
    row["countries"] = set(
        filter(
            lambda x: str(x) == x,
            [
                location
                for author in json.loads(row["json"])["authors"]
                for location in get_locations(author)
            ],
        )
    )
    return row
def add_json(rdf):
    return rdf.map(get_json).filter(lambda row: row["json"] is not None)
def dedupe_country(row):
    countries = row["countries"]
    normalized_countries = []
    for country in countries:
        if country == "Europe":
            normalized_countries.append(country)
        else:
            normalized = guess_country(country)
            normalized_countries.append(normalized["name_short"])
    row["countries"] = set(normalized_countries)
    return row
def add_country(rdf):
    return (
        rdf.map(get_institutions)
        .filter(lambda row: len(row["countries"]) > 0)
        .map(dedupe_country)
    )
def check_lang(row):
    row["langs"] = [
        match.lower().capitalize()
        for match in re.findall(lang_pattern, str(row["full_text"]))
    ]
    return row
def re_list(row):
    if "countries" in row:
        row["countries"] = row["countries"].split("[SEP]")
    if "langs" in row:
        row["langs"] = row["langs"].split("[SEP]")
    return row
sample_5 = functools.partial(np.random.choice, size=5, replace=False)
sample = lambda x: sample_5(x) if len(x) >= 5 else x
def get_lang_sents(row):
    row["langs"] = [lang for lang in row["langs"] if len(lang) > 0]
    sents = [
        sent
        for sent in sent_tokenize(row["full_text"])
        if any([lang in sent for lang in row["langs"]])
        if len(sent) < 1000
    ]
    row["lang_mentions"] = sents
    row["lang_mentions_sample"] = list(sample(sents))
    row["tok_len"] = len(enc.encode(str(row["lang_mentions_sample"])))
    return row
def load_open_ai_cache():
    # Return an empty cache if no responses have been recorded yet.
    if not os.path.isfile("raw_openai_resp_cache"):
        return {}
    with open("raw_openai_resp_cache", "r") as f:
        raw = f.read()
    entries = [
        line.replace("[BEG_WILL]", "").split("[MID_WILL]")
        for line in raw.split("[END_WILL]")
    ]
    c = {entry[0]: json.loads(entry[1]) for entry in entries if len(entry) == 2}
    return c
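# Cache file format (as written by the commented-out OpenAI call further down):
# entries of the form
#   [BEG_WILL]<acl_id>[MID_WILL]<stringified JSON API response>[END_WILL]
# concatenated in one file; load_open_ai_cache() parses them back into
# {acl_id: response_dict}.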
response_cache = load_open_ai_cache()
def get_langs_chatgpt(row):
    # The live OpenAI call is disabled below; responses are served from the
    # pre-computed cache, and rows without a cached response get an empty dict.
    global response_cache
    engine = "chatgpt0613"
    if not len(row["lang_mentions_sample"]) > 0:
        row["open_ai_resp"] = str({})
        return row
    elif row["acl_id"] in response_cache:
        row["open_ai_resp"] = str(response_cache[row["acl_id"]])
        return row
    else:
        row["open_ai_resp"] = str({})
        return row
        # time.sleep(0.1)
        # input_msgs = [
        #     {
        #         "role": "system",
        #         "content": "You are a Natural Language Processing expert carefully studying papers from ACL. On each line, only return valid Python set.",
        #     },
        #     {
        #         "role": "user",
        #         "content": str(
        #             'What are the primary languages of interest from this set of a paper with these sentences? Ignore languages that are only mentioned in passing, for example mentions like "Unlike English": should not lead to English being included in the set. \n Sentences: '
        #             + "\n".join(row["lang_mentions_sample"])
        #         ),
        #     },
        # ]
        # row["open_ai_resp"] = str(
        #     openai.ChatCompletion.create(
        #         engine=engine, messages=input_msgs, temperature=0, stop="\n"
        #     )
        # )
        # file1 = open("raw_openai_resp_cache", "a")  # append mode
        # file1.write(f'[BEG_WILL]{row["acl_id"]}[MID_WILL]{row["open_ai_resp"]}[END_WILL]')
        # file1.close()
        # return row
def filter_using_openai(row):
    final_langs = []
    for lang in set(row["langs"]):
        # 'Chinese' is linguistically ambiguous, but pervasive; map it to Mandarin
        # here since that is the most frequent sense.
        if lang == "Mandarin" and "Chinese" in row["open_ai_resp"]:
            final_langs.append(lang)
        elif lang in row["open_ai_resp"]:
            final_langs.append(lang)
    row["final_langs"] = final_langs
    return row
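# Illustrative check with a hypothetical row (not from the dataset):
#   filter_using_openai({"langs": ["French", "Mandarin"],
#                        "open_ai_resp": "{'French', 'Chinese'}"})["final_langs"]
#   -> ['French', 'Mandarin'] in some order (iteration over a set)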
if __name__ == "__main__":
    import pickle as pkl

    # Stage 1: attach the parsed paper JSON and author countries, cached in "cache".
    if not os.path.isfile("cache"):
        df = pkl.load(
            open(
                "acl-publication-info.74k.v3.full-sections-partial-topic-labels.pkl",
                "rb",
            )
        )
        rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
        ray.data.DataContext.get_current().execution_options.verbose_progress = True
        rdf = add_json(rdf)
        rdf = add_country(rdf)
        with open("cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)

    # Stage 2: regex-match language names in the full text, cached in "lang_cache".
    if not os.path.isfile("lang_cache"):
        rdf = rdf.map(check_lang)
        with open("lang_cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("lang_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)

    # Stage 3: sample language-mention sentences and attach (cached) ChatGPT responses.
    if not os.path.isfile("open_ai_cache"):
        rdf = rdf.map(get_lang_sents).map(
            get_langs_chatgpt, compute=ray.data.ActorPoolStrategy(size=2)
        )
        df = rdf.to_pandas()
        with open("open_ai_cache", "wb") as f:
            pkl.dump(df, f)
    with open("open_ai_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    df["open_ai_resp"] = df.open_ai_resp.apply(ast.literal_eval).apply(
        lambda x: x["choices"][0]["message"]["content"] if "choices" in x else ""
    )
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list).map(filter_using_openai)
    df = rdf.to_pandas()

    # Drop into an interactive console for manual inspection of the results.
    code.InteractiveConsole(locals=globals()).interact()