import ast

# To Delete After Debug
import code

import copyreg
import datetime
import functools
import json
import os
import re
import time
from ast import literal_eval
from collections import defaultdict
from urllib.parse import urlsplit

import gnureadline  # imported for its side effect: line editing in the debug console
import numpy as np
import openai
import pandas as pd
import ray
import tiktoken
from countryguess import guess_country
from fuzzysearch import find_near_matches
from nltk.tokenize import sent_tokenize
from tqdm import tqdm


def load_mapping():
    with open("domains.csv", "r") as f:
        lines = f.readlines()
    return {line.split(",")[0].strip(): line.split(",")[1].strip() for line in lines}
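# Assumed "domains.csv" layout, based on how the mapping is used below: one
# "tld,Country" pair per line (e.g. "uk,United Kingdom"), keyed by the final
# dot-component of an author's email address.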

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")


# tiktoken's Encoding wraps a Rust tokenizer and is not picklable by default,
# but Ray needs to ship `enc` to its workers. Register a copyreg reducer that
# rebuilds the Encoding from its constructor arguments instead.
def pickle_Encoding(enc):
    return (
        functools.partial(
            tiktoken.core.Encoding,
            enc.name,
            pat_str=enc._pat_str,
            mergeable_ranks=enc._mergeable_ranks,
            special_tokens=enc._special_tokens,
        ),
        (),
    )


copyreg.pickle(tiktoken.core.Encoding, pickle_Encoding)
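# Quick sanity check for the reducer above (a sketch; run manually if desired):
#   import pickle
#   assert pickle.loads(pickle.dumps(enc)).name == enc.name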

# The "," alternative lets country_pattern (compiled below) double as a
# delimiter when re.split() breaks multi-country strings apart.
countries = (
    "("
    + "|".join(
        [
            re.escape(country)
            for country in open("list-of-countries.txt", "r").read().split("\n")
        ]
        + [","]
    )
    + ")"
)
# Manually add Chinese since it's not in the taxonomy
langs = (
    r"\b("
    + "|".join([re.escape(lang) for lang in pd.read_csv("lang2tax.txt").language.array])
    + "|Chinese"
    + r")\b"
)
lang_pattern = re.compile(langs, flags=re.IGNORECASE)
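# Example (assuming "Swahili" appears in lang2tax.txt):
#   lang_pattern.findall("Results on Swahili and chinese data")
# returns ["Swahili", "chinese"]; matching is case-insensitive and word-bounded.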
uni_db = pd.read_csv("world-universities.csv").astype(str)
# Reduce each university homepage URL to a bare domain, e.g. http://www.mit.edu -> mit.edu
uni_db["uni_website"] = uni_db["uni_website"].map(
    lambda x: urlsplit(x).netloc.replace("www.", "")
)
uni_db = uni_db.set_index(["uni_name"]).sort_index()
# GeoNames postal-code dump, keyed by postal code
zcdb = pd.read_csv(
    "allCountries.txt",
    sep="\t",
    dtype=str,
    names=[
        "country_code",
        "postal_code",
        "place_name",
        "admin1_name",
        "admin1_code",
        "admin2_name",
        "admin2_code",
        "admin3_name",
        "admin3_code",
        "latitude",
        "longitude",
        "accuracy",
    ],
).set_index(["postal_code"])
# Strip French CEDEX suffixes so business postal codes match their plain forms;
# .strip() also removes the whitespace the substitution leaves behind.
cedex_pattern = r"CEDEX( [0-9])?"
zcdb.index = zcdb.index.map(str).map(lambda x: re.sub(cedex_pattern, "", x).strip())
zcdb = zcdb.astype(str).sort_index()
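# Example of the normalization above:
#   re.sub(cedex_pattern, "", "69622 CEDEX").strip() -> "69622"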
country_pattern = re.compile(countries, flags=re.IGNORECASE)
zip_patterns = [
    r"[0-9]{4,5}",  # the most common numeric formats
    r"(?i)([a-z0-9][a-z0-9\- ]{0,10}[a-z0-9])",  # captures all global formats in the GeoNames DB
]
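# Example: re.findall(zip_patterns[0], "Cambridge MA 02139 USA") -> ["02139"].
# The broader second pattern over-generates; candidates are kept only when they
# appear in the GeoNames index (see get_locations below).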
email_mapping = load_mapping()


def get_prefixes(acl_id):
    # Legacy IDs like "P19-1001" start with a letter; modern IDs like
    # "2020.acl-main.1" start with the year, which the [5:] slice drops.
    prefix = acl_id if acl_id[0].isalpha() else acl_id[5:]
    first_prefix = prefix[0].upper()
    second_prefix = prefix.split("-")[0]
    return first_prefix, second_prefix
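# Examples for the two ACL Anthology ID formats (hypothetical IDs):
#   get_prefixes("P19-1001")        -> ("P", "P19")
#   get_prefixes("2020.acl-main.1") -> ("A", "acl")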


def get_json(row):
    acl_id = row["acl_id"]
    first_prefix, second_prefix = get_prefixes(acl_id)
    try:
        row["json"] = open(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix}/{acl_id}.json", "r"
        ).read()
    except OSError:
        # Fall back to the alternate on-disk naming derived from the paper URL
        prior = row["url"].split("/")[-3][-2:]
        latter = row["url"].split("/")[-1].replace(".pdf", "")
        print(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json"
        )
        try:
            row["json"] = open(
                f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json",
                "r",
            ).read()
        except OSError:
            row["json"] = None
    return row


def get_locations(author):
    """Resolve an author record to a list of country names.

    Resolution order: manual email-TLD mapping, explicit country field,
    postal-code lookup in GeoNames, university-website lookup, and finally
    fuzzy matching of the institution name against the university list.
    """
    affiliation = author["affiliation"]
    if "email" in author and author["email"].split(".")[-1] in email_mapping:
        return [email_mapping[author["email"].split(".")[-1]]]
    if "location" in affiliation and "country" in affiliation["location"]:
        countries = []
        for country in re.split(
            country_pattern,
            affiliation["location"]["country"].replace(" and ", ","),
        ):
            guess = guess_country(country)
            if guess:
                countries.append(guess["name_short"])
        if len(countries) > 0:
            return countries
    if "location" in affiliation and len(affiliation["location"]) > 0:
        location = defaultdict(str, affiliation["location"])
        full_addr = f"{location['addrLine']} {location['settlement']} {location['region']} {location['postCode']}"
        # Check the original dict here: reading from the defaultdict above
        # inserts missing keys, so "postCode" would always look present.
        if "postCode" in affiliation["location"]:
            code_matches = []
            zip_codes = [
                match
                for pattern in zip_patterns
                for match in re.findall(pattern, full_addr)
            ]
            zip_codes = [code for code in zip_codes if code in zcdb.index]
            zipc = zcdb.loc[zip_codes]
            # A postal-code hit counts only if its place or admin names also
            # appear in the address string.
            for _, code in zipc.iterrows():
                if (
                    code.place_name in full_addr
                    or (code.admin1_code in full_addr and code.admin1_code.isalpha())
                    or code.admin1_name in full_addr
                ):
                    code_matches.append(guess_country(code.country_code)["name_short"])
            if len(code_matches) > 0:
                return code_matches
    if "email" in author and author["email"]:
        author_website = author["email"].split("@")[-1]
        if author_website in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_website].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]
        # Retry with the bare registered domain, e.g. cs.example.edu -> example.edu
        author_tld_only = (
            ".".join(author_website.split(".")[-2:])
            if len(author_website.split(".")) > 2
            else None
        )
        if author_tld_only and author_tld_only in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_tld_only].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]
        # And the reverse: truncate the university websites instead
        tld_only = uni_db.uni_website.map(lambda x: ".".join(x.split(".")[-2:]))
        if author_website in tld_only.array:
            return [
                guess_country(
                    uni_db[tld_only == author_website].iloc[0]["country_code"]
                )["name_short"]
            ]
    if "institution" in affiliation or "laboratory" in affiliation:
        # .get() avoids a KeyError when only one of the two fields is present
        lab_and_school = (
            f"{affiliation.get('laboratory', '')} {affiliation.get('institution', '')}"
        )
        school_matches = []
        for uni in uni_db.index.array:
            cand_match = find_near_matches(
                uni, lab_and_school, max_l_dist=1, max_substitutions=0, max_deletions=0
            )
            if len(cand_match) > 0:
                match = cand_match[0].matched
                school_matches.append((match, uni))
        # Drop any match whose matched text is contained in another candidate's
        # match, e.g. prefer "University of California" over bare "California"
        c = school_matches
        school_matches = [
            match[1]
            for match in school_matches
            if all(
                [
                    (match[0] not in c_match[0] or match[1] == c_match[1])
                    for c_match in school_matches
                ]
            )
        ]
        if len(school_matches) > 0:
            countries = []
            for match in school_matches:
                country_info = uni_db.loc[match].country_code
                # astype(str) above turns missing codes into the string "nan"
                if isinstance(country_info, str) and country_info != "nan":
                    countries.append(guess_country(country_info)["name_short"])
            if len(countries) > 0:
                return countries
    return []
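# Usage sketch (hypothetical GROBID-style record; the actual result depends on
# the lookup tables loaded above):
#   get_locations({"email": "a@cs.stanford.edu",
#                  "affiliation": {"institution": "Stanford University"}})
# would typically resolve via the email-TLD mapping or the university tables
# to ["United States"].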


def get_institutions(row):
    # Keep only string-valued locations (drops NaN and other non-strings)
    row["countries"] = set(
        filter(
            lambda x: str(x) == x,
            [
                location
                for author in json.loads(row["json"])["authors"]
                for location in get_locations(author)
            ],
        )
    )
    return row


def add_json(rdf):
    return rdf.map(get_json).filter(lambda row: row["json"] is not None)


def dedupe_country(row):
    countries = row["countries"]
    normalized_countries = []
    for country in countries:
        # "Europe" is not a country, so countryguess cannot normalize it
        if country == "Europe":
            normalized_countries.append(country)
        else:
            normalized = guess_country(country)
            normalized_countries.append(normalized["name_short"])
    row["countries"] = set(normalized_countries)
    return row


def add_country(rdf):
    return (
        rdf.map(get_institutions)
        .filter(lambda row: len(row["countries"]) > 0)
        .map(dedupe_country)
    )


def check_lang(row):
    row["langs"] = [
        match.lower().capitalize()
        for match in re.findall(lang_pattern, str(row["full_text"]))
    ]
    return row


def re_list(row):
    # Round-trip helper: list columns are stored as "[SEP]"-joined strings in
    # the on-disk caches and re-split here after reload.
    if "countries" in row:
        row["countries"] = row["countries"].split("[SEP]")
    if "langs" in row:
        row["langs"] = row["langs"].split("[SEP]")
    return row


sample_5 = functools.partial(np.random.choice, size=5, replace=False)
sample = lambda x: sample_5(x) if len(x) >= 5 else x
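# sample() draws 5 items without replacement when at least 5 are available and
# passes shorter sequences through unchanged, e.g.:
#   sample(["a", "b"]) -> ["a", "b"]
#   len(sample(list(range(100)))) -> 5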


def get_lang_sents(row):
    row["langs"] = [lang for lang in row["langs"] if len(lang) > 0]
    # Keep sentences that mention a detected language, capped at 1000 chars each
    sents = [
        sent
        for sent in sent_tokenize(row["full_text"])
        if any([lang in sent for lang in row["langs"]])
        if len(sent) < 1000
    ]
    row["lang_mentions"] = sents
    row["lang_mentions_sample"] = list(sample(sents))
    row["tok_len"] = len(enc.encode(str(row["lang_mentions_sample"])))
    return row


def load_open_ai_cache():
    with open("raw_openai_resp_cache", "r") as f:
        raw = f.read()
    entries = [
        line.replace("[BEG_WILL]", "").split("[MID_WILL]")
        for line in raw.split("[END_WILL]")
    ]
    c = {entry[0]: json.loads(entry[1]) for entry in entries if len(entry) == 2}
    return c
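# Cache file layout, as the parser above expects it: one record per response,
#   [BEG_WILL]<acl_id>[MID_WILL]<JSON response>[END_WILL]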


response_cache = load_open_ai_cache()


def get_langs_chatgpt(row):
    global response_cache
    engine = "chatgpt0613"
    if not row["lang_mentions_sample"]:
        row["open_ai_resp"] = str({})
        return row
    elif row["acl_id"] in response_cache:
        row["open_ai_resp"] = str(response_cache[row["acl_id"]])
        return row
    else:
        # Live API calls are disabled below; uncached rows get an empty response.
        row["open_ai_resp"] = str({})
        return row
# time.sleep(0.1)
# input_msgs = [
#     {
#         "role": "system",
#         "content": "You are a Natural Language Processing expert carefully studying papers from ACL. On each line, only return valid Python set.",
#     },
#     {
#         "role": "user",
#         "content": str(
#             'What are the primary languages of interest from this set of a paper with these sentences? Ignore languages that are only mentioned in passing, for example mentions like "Unlike English": should not lead to English being included in the set. \n Sentences: '
#             + "\n".join(row["lang_mentions_sample"])
#         ),
#     },
# ]
# row["open_ai_resp"] = str(
#     openai.ChatCompletion.create(
#         engine=engine, messages=input_msgs, temperature=0, stop="\n"
#     )
# )
# file1 = open("raw_openai_resp_cache", "a")  # append mode
# file1.write(f'[BEG_WILL]{row["acl_id"]}[MID_WILL]{row["open_ai_resp"]}[END_WILL]')
# file1.close()
# return row


def filter_using_openai(row):
    final_langs = []
    for lang in set(row["langs"]):
        # 'Chinese' is linguistically ambiguous, but pervasive.
        # Map it to Mandarin here as that is the most frequent sense.
        if lang == "Mandarin" and "Chinese" in row["open_ai_resp"]:
            final_langs.append(lang)
        elif lang in row["open_ai_resp"]:
            final_langs.append(lang)
    row["final_langs"] = final_langs
    return row
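# Example (hypothetical row): with row["langs"] == ["Mandarin", "Tagalog"] and
# an open_ai_resp of "{'Chinese'}", final_langs comes out as ["Mandarin"]:
# regex hits survive only if the model's response also names them (with
# Chinese standing in for Mandarin).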


if __name__ == "__main__":
    import pickle as pkl

    if not os.path.isfile("cache"):
        with open(
            "acl-publication-info.74k.v3.full-sections-partial-topic-labels.pkl",
            "rb",
        ) as f:
            df = pkl.load(f)
        rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
        ray.data.DataContext.get_current().execution_options.verbose_progress = True
        rdf = add_json(rdf)
        rdf = add_country(rdf)
        with open("cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("cache", "rb") as f:
        df = pkl.load(f)
    # List- and set-valued columns are flattened to "[SEP]"-joined strings so
    # the pandas -> Ray conversion round-trips cleanly; re_list re-splits them.
    df["countries"] = df["countries"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)
    if not os.path.isfile("lang_cache"):
        rdf = rdf.map(check_lang)
        with open("lang_cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("lang_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)
    if not os.path.isfile("open_ai_cache"):
        rdf = rdf.map(get_lang_sents).map(
            get_langs_chatgpt, compute=ray.data.ActorPoolStrategy(size=2)
        )
        df = rdf.to_pandas()
        with open("open_ai_cache", "wb") as f:
            pkl.dump(df, f)
    with open("open_ai_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    # Pull the message text out of the raw ChatCompletion response dicts
    df["open_ai_resp"] = df.open_ai_resp.apply(ast.literal_eval).apply(
        lambda x: x["choices"][0]["message"]["content"] if "choices" in x else ""
    )
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list).map(filter_using_openai)
    df = rdf.to_pandas()
    # Drop into an interactive console to inspect the final dataframe
    code.InteractiveConsole(locals=globals()).interact()