import ast

# To Delete After Debug
import code

import copyreg
import datetime
import functools
import json
import os
import re
import time
from ast import literal_eval
from collections import defaultdict
from urllib.parse import urlsplit

import gnureadline  # imported for its side effect: line editing in the debug console
import numpy as np
import openai
import pandas as pd
import ray
import tiktoken
from countryguess import guess_country
from fuzzysearch import find_near_matches
from nltk.tokenize import sent_tokenize
from tqdm import tqdm


def load_mapping():
    with open("domains.csv", "r") as f:
        lines = f.readlines()
    return {line.split(",")[0].strip(): line.split(",")[1].strip() for line in lines}
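# Assumed "domains.csv" layout, based on how the mapping is used below: one
# "tld,Country" pair per line (e.g. "uk,United Kingdom"), keyed by the final
# dot-component of an author's email address.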

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")


# tiktoken's Encoding wraps a Rust tokenizer and is not picklable by default,
# but Ray needs to ship `enc` to its workers. Register a copyreg reducer that
# rebuilds the Encoding from its constructor arguments instead.
def pickle_Encoding(enc):
    return (
        functools.partial(
            tiktoken.core.Encoding,
            enc.name,
            pat_str=enc._pat_str,
            mergeable_ranks=enc._mergeable_ranks,
            special_tokens=enc._special_tokens,
        ),
        (),
    )


copyreg.pickle(tiktoken.core.Encoding, pickle_Encoding)
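# Quick sanity check for the reducer above (a sketch; run manually if desired):
#   import pickle
#   assert pickle.loads(pickle.dumps(enc)).name == enc.name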

# The "," alternative lets country_pattern (compiled below) double as a
# delimiter when re.split() breaks multi-country strings apart.
countries = (
    "("
    + "|".join(
        [
            re.escape(country)
            for country in open("list-of-countries.txt", "r").read().split("\n")
        ]
        + [","]
    )
    + ")"
)
# Manually add Chinese since it's not in the taxonomy
langs = (
    r"\b("
    + "|".join([re.escape(lang) for lang in pd.read_csv("lang2tax.txt").language.array])
    + "|Chinese"
    + r")\b"
)
lang_pattern = re.compile(langs, flags=re.IGNORECASE)
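# Example (assuming "Swahili" appears in lang2tax.txt):
#   lang_pattern.findall("Results on Swahili and chinese data")
# returns ["Swahili", "chinese"]; matching is case-insensitive and word-bounded.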
uni_db = pd.read_csv("world-universities.csv").astype(str)
# Reduce each university homepage URL to a bare domain, e.g. http://www.mit.edu -> mit.edu
uni_db["uni_website"] = uni_db["uni_website"].map(
    lambda x: urlsplit(x).netloc.replace("www.", "")
)
uni_db = uni_db.set_index(["uni_name"]).sort_index()
# GeoNames postal-code dump, keyed by postal code
zcdb = pd.read_csv(
    "allCountries.txt",
    sep="\t",
    dtype=str,
    names=[
        "country_code",
        "postal_code",
        "place_name",
        "admin1_name",
        "admin1_code",
        "admin2_name",
        "admin2_code",
        "admin3_name",
        "admin3_code",
        "latitude",
        "longitude",
        "accuracy",
    ],
).set_index(["postal_code"])
# Strip French CEDEX suffixes so business postal codes match their plain forms;
# .strip() also removes the whitespace the substitution leaves behind.
cedex_pattern = r"CEDEX( [0-9])?"
zcdb.index = zcdb.index.map(str).map(lambda x: re.sub(cedex_pattern, "", x).strip())
zcdb = zcdb.astype(str).sort_index()
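# Example of the normalization above:
#   re.sub(cedex_pattern, "", "69622 CEDEX").strip() -> "69622"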
country_pattern = re.compile(countries, flags=re.IGNORECASE)
zip_patterns = [
    r"[0-9]{4,5}",  # the most common numeric formats
    r"(?i)([a-z0-9][a-z0-9\- ]{0,10}[a-z0-9])",  # captures all global formats in the GeoNames DB
]
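# Example: re.findall(zip_patterns[0], "Cambridge MA 02139 USA") -> ["02139"].
# The broader second pattern over-generates; candidates are kept only when they
# appear in the GeoNames index (see get_locations below).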
email_mapping = load_mapping()


def get_prefixes(acl_id):
    # Legacy IDs like "P19-1001" start with a letter; modern IDs like
    # "2020.acl-main.1" start with the year, which the [5:] slice drops.
    prefix = acl_id if acl_id[0].isalpha() else acl_id[5:]
    first_prefix = prefix[0].upper()
    second_prefix = prefix.split("-")[0]
    return first_prefix, second_prefix
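# Examples for the two ACL Anthology ID formats (hypothetical IDs):
#   get_prefixes("P19-1001")        -> ("P", "P19")
#   get_prefixes("2020.acl-main.1") -> ("A", "acl")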


def get_json(row):
    acl_id = row["acl_id"]
    first_prefix, second_prefix = get_prefixes(acl_id)
    try:
        row["json"] = open(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix}/{acl_id}.json", "r"
        ).read()
    except OSError:
        # Fall back to the alternate on-disk naming derived from the paper URL
        prior = row["url"].split("/")[-3][-2:]
        latter = row["url"].split("/")[-1].replace(".pdf", "")
        print(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json"
        )
        try:
            row["json"] = open(
                f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json",
                "r",
            ).read()
        except OSError:
            row["json"] = None
    return row


def get_locations(author):
    """Resolve an author record to a list of country names.

    Resolution order: manual email-TLD mapping, explicit country field,
    postal-code lookup in GeoNames, university-website lookup, and finally
    fuzzy matching of the institution name against the university list.
    """
    affiliation = author["affiliation"]
    if "email" in author and author["email"].split(".")[-1] in email_mapping:
        return [email_mapping[author["email"].split(".")[-1]]]
    if "location" in affiliation and "country" in affiliation["location"]:
        countries = []
        for country in re.split(
            country_pattern,
            affiliation["location"]["country"].replace(" and ", ","),
        ):
            guess = guess_country(country)
            if guess:
                countries.append(guess["name_short"])
        if len(countries) > 0:
            return countries
    if "location" in affiliation and len(affiliation["location"]) > 0:
        location = defaultdict(str, affiliation["location"])
        full_addr = f"{location['addrLine']} {location['settlement']} {location['region']} {location['postCode']}"
        # Check the original dict here: reading from the defaultdict above
        # inserts missing keys, so "postCode" would always look present.
        if "postCode" in affiliation["location"]:
            code_matches = []
            zip_codes = [
                match
                for pattern in zip_patterns
                for match in re.findall(pattern, full_addr)
            ]
            zip_codes = [code for code in zip_codes if code in zcdb.index]
            zipc = zcdb.loc[zip_codes]
            # A postal-code hit counts only if its place or admin names also
            # appear in the address string.
            for _, code in zipc.iterrows():
                if (
                    code.place_name in full_addr
                    or (code.admin1_code in full_addr and code.admin1_code.isalpha())
                    or code.admin1_name in full_addr
                ):
                    code_matches.append(guess_country(code.country_code)["name_short"])
            if len(code_matches) > 0:
                return code_matches
    if "email" in author and author["email"]:
        author_website = author["email"].split("@")[-1]
        if author_website in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_website].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]
        # Retry with the bare registered domain, e.g. cs.example.edu -> example.edu
        author_tld_only = (
            ".".join(author_website.split(".")[-2:])
            if len(author_website.split(".")) > 2
            else None
        )
        if author_tld_only and author_tld_only in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_tld_only].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]
        # And the reverse: truncate the university websites instead
        tld_only = uni_db.uni_website.map(lambda x: ".".join(x.split(".")[-2:]))
        if author_website in tld_only.array:
            return [
                guess_country(
                    uni_db[tld_only == author_website].iloc[0]["country_code"]
                )["name_short"]
            ]
    if "institution" in affiliation or "laboratory" in affiliation:
        # .get() avoids a KeyError when only one of the two fields is present
        lab_and_school = (
            f"{affiliation.get('laboratory', '')} {affiliation.get('institution', '')}"
        )
        school_matches = []
        for uni in uni_db.index.array:
            cand_match = find_near_matches(
                uni, lab_and_school, max_l_dist=1, max_substitutions=0, max_deletions=0
            )
            if len(cand_match) > 0:
                match = cand_match[0].matched
                school_matches.append((match, uni))
        # Drop any match whose matched text is contained in another candidate's
        # match, e.g. prefer "University of California" over bare "California"
        c = school_matches
        school_matches = [
            match[1]
            for match in school_matches
            if all(
                [
                    (match[0] not in c_match[0] or match[1] == c_match[1])
                    for c_match in school_matches
                ]
            )
        ]
        if len(school_matches) > 0:
            countries = []
            for match in school_matches:
                country_info = uni_db.loc[match].country_code
                # astype(str) above turns missing codes into the string "nan"
                if isinstance(country_info, str) and country_info != "nan":
                    countries.append(guess_country(country_info)["name_short"])
            if len(countries) > 0:
                return countries
    return []
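# Usage sketch (hypothetical GROBID-style record; the actual result depends on
# the lookup tables loaded above):
#   get_locations({"email": "a@cs.stanford.edu",
#                  "affiliation": {"institution": "Stanford University"}})
# would typically resolve via the email-TLD mapping or the university tables
# to ["United States"].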


def get_institutions(row):
    # Keep only string-valued locations (drops NaN and other non-strings)
    row["countries"] = set(
        filter(
            lambda x: str(x) == x,
            [
                location
                for author in json.loads(row["json"])["authors"]
                for location in get_locations(author)
            ],
        )
    )
    return row


def add_json(rdf):
    return rdf.map(get_json).filter(lambda row: row["json"] is not None)


def dedupe_country(row):
    countries = row["countries"]
    normalized_countries = []
    for country in countries:
        # "Europe" is not a country, so countryguess cannot normalize it
        if country == "Europe":
            normalized_countries.append(country)
        else:
            normalized = guess_country(country)
            normalized_countries.append(normalized["name_short"])
    row["countries"] = set(normalized_countries)
    return row


def add_country(rdf):
    return (
        rdf.map(get_institutions)
        .filter(lambda row: len(row["countries"]) > 0)
        .map(dedupe_country)
    )


def check_lang(row):
    row["langs"] = [
        match.lower().capitalize()
        for match in re.findall(lang_pattern, str(row["full_text"]))
    ]
    return row


def re_list(row):
    # Round-trip helper: list columns are stored as "[SEP]"-joined strings in
    # the on-disk caches and re-split here after reload.
    if "countries" in row:
        row["countries"] = row["countries"].split("[SEP]")
    if "langs" in row:
        row["langs"] = row["langs"].split("[SEP]")
    return row


sample_5 = functools.partial(np.random.choice, size=5, replace=False)
sample = lambda x: sample_5(x) if len(x) >= 5 else x
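# sample() draws 5 items without replacement when at least 5 are available and
# passes shorter sequences through unchanged, e.g.:
#   sample(["a", "b"]) -> ["a", "b"]
#   len(sample(list(range(100)))) -> 5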


def get_lang_sents(row):
    row["langs"] = [lang for lang in row["langs"] if len(lang) > 0]
    # Keep sentences that mention a detected language, capped at 1000 chars each
    sents = [
        sent
        for sent in sent_tokenize(row["full_text"])
        if any([lang in sent for lang in row["langs"]])
        if len(sent) < 1000
    ]
    row["lang_mentions"] = sents
    row["lang_mentions_sample"] = list(sample(sents))
    row["tok_len"] = len(enc.encode(str(row["lang_mentions_sample"])))
    return row


def load_open_ai_cache():
    with open("raw_openai_resp_cache", "r") as f:
        raw = f.read()
    entries = [
        line.replace("[BEG_WILL]", "").split("[MID_WILL]")
        for line in raw.split("[END_WILL]")
    ]
    c = {entry[0]: json.loads(entry[1]) for entry in entries if len(entry) == 2}
    return c
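# Cache file layout, as the parser above expects it: one record per response,
#   [BEG_WILL]<acl_id>[MID_WILL]<JSON response>[END_WILL]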


response_cache = load_open_ai_cache()


def get_langs_chatgpt(row):
    global response_cache
    engine = "chatgpt0613"
    if not row["lang_mentions_sample"]:
        row["open_ai_resp"] = str({})
        return row
    elif row["acl_id"] in response_cache:
        row["open_ai_resp"] = str(response_cache[row["acl_id"]])
        return row
    else:
        # Live API calls are disabled below; uncached rows get an empty response.
        row["open_ai_resp"] = str({})
        return row
# time.sleep(0.1)
# input_msgs = [
#     {
#         "role": "system",
#         "content": "You are a Natural Language Processing expert carefully studying papers from ACL. On each line, only return valid Python set.",
#     },
#     {
#         "role": "user",
#         "content": str(
#             'What are the primary languages of interest from this set of a paper with these sentences? Ignore languages that are only mentioned in passing, for example mentions like "Unlike English": should not lead to English being included in the set. \n Sentences: '
#             + "\n".join(row["lang_mentions_sample"])
#         ),
#     },
# ]
# row["open_ai_resp"] = str(
#     openai.ChatCompletion.create(
#         engine=engine, messages=input_msgs, temperature=0, stop="\n"
#     )
# )
# file1 = open("raw_openai_resp_cache", "a")  # append mode
# file1.write(f'[BEG_WILL]{row["acl_id"]}[MID_WILL]{row["open_ai_resp"]}[END_WILL]')
# file1.close()
# return row


def filter_using_openai(row):
    final_langs = []
    for lang in set(row["langs"]):
        # 'Chinese' is linguistically ambiguous, but pervasive.
        # Map it to Mandarin here as that is the most frequent sense.
        if lang == "Mandarin" and "Chinese" in row["open_ai_resp"]:
            final_langs.append(lang)
        elif lang in row["open_ai_resp"]:
            final_langs.append(lang)
    row["final_langs"] = final_langs
    return row
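# Example (hypothetical row): with row["langs"] == ["Mandarin", "Tagalog"] and
# an open_ai_resp of "{'Chinese'}", final_langs comes out as ["Mandarin"]:
# regex hits survive only if the model's response also names them (with
# Chinese standing in for Mandarin).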


if __name__ == "__main__":
    import pickle as pkl

    if not os.path.isfile("cache"):
        with open(
            "acl-publication-info.74k.v3.full-sections-partial-topic-labels.pkl",
            "rb",
        ) as f:
            df = pkl.load(f)
        rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
        ray.data.DataContext.get_current().execution_options.verbose_progress = True
        rdf = add_json(rdf)
        rdf = add_country(rdf)
        with open("cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("cache", "rb") as f:
        df = pkl.load(f)
    # List- and set-valued columns are flattened to "[SEP]"-joined strings so
    # the pandas -> Ray conversion round-trips cleanly; re_list re-splits them.
    df["countries"] = df["countries"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)
    if not os.path.isfile("lang_cache"):
        rdf = rdf.map(check_lang)
        with open("lang_cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("lang_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)
    if not os.path.isfile("open_ai_cache"):
        rdf = rdf.map(get_lang_sents).map(
            get_langs_chatgpt, compute=ray.data.ActorPoolStrategy(size=2)
        )
        df = rdf.to_pandas()
        with open("open_ai_cache", "wb") as f:
            pkl.dump(df, f)
    with open("open_ai_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    # Pull the message text out of the raw ChatCompletion response dicts
    df["open_ai_resp"] = df.open_ai_resp.apply(ast.literal_eval).apply(
        lambda x: x["choices"][0]["message"]["content"] if "choices" in x else ""
    )
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list).map(filter_using_openai)
    df = rdf.to_pandas()
    # Drop into an interactive console to inspect the final dataframe
    code.InteractiveConsole(locals=globals()).interact()