Skip to content

Instantly share code, notes, and snippets.

@ncoop57
Created January 26, 2023 05:45
Show Gist options
  • Save ncoop57/f700ea0980ed6d19673440ac19e5c457 to your computer and use it in GitHub Desktop.
Save ncoop57/f700ea0980ed6d19673440ac19e5c457 to your computer and use it in GitHub Desktop.
# Exploratory script: list parquet objects under an S3 prefix, load a slice of
# them into a Spark DataFrame, and display the rows whose `meta` column
# mentions selected source names.
import boto3

from spark_session_builder import build_spark_session

# Bucket and key prefix whose objects are listed.
BUCKET_NAME = "s-eai-neox"
KEY_PREFIX = "data/codepile/group1/"

# Substrings searched for in the `meta` column, one `.show()` per entry.
META_SOURCES = (
    "Project Gutenberg",
    "Ubuntu IRC",
    "USPTO-Application",
    "S2ORC",
)

s3 = boto3.resource("s3")
my_bucket = s3.Bucket(BUCKET_NAME)

# Build one s3a:// URI per object key found under the prefix.
file_paths = [
    f"s3a://{BUCKET_NAME}/{my_bucket_object.key}"
    for my_bucket_object in my_bucket.objects.filter(Prefix=KEY_PREFIX)
]
print(len(file_paths))

# Only the second hundred listed objects are loaded. If fewer than 101 objects
# were listed this slice is empty and the parquet read below receives no paths.
file_paths = file_paths[100:200]

# NOTE(review): the meaning of the two numeric arguments is defined by
# spark_session_builder (presumably core count and memory) -- confirm there.
spark = build_spark_session("spark://cpu64-dy-r6i-16xlarge-9:7077", 32, 256)
data = spark.read.parquet(*file_paths)
data.show()

# data.filter(data.meta.contains("arXiv_out")).show(truncate=False)
for source_name in META_SOURCES:
    data.filter(data.meta.contains(source_name)).show()

# Reference predicates kept as notes (not executed by this script):
# "PubMed_ver2" : lambda example : ast.literal_eval(example["meta"]["source"]) == "PubMedDataset",
# "Gutenberg_ver2" : lambda example : ast.literal_eval(example["meta"]["source"]) == "Project Gutenberg",
# "FreeLaw_Options_ver2" : lambda example : "date_created" in example.keys(),
# "UbuntuIRC_ver2" : lambda example : ast.literal_eval(example["meta"]["source"]) == "Ubuntu IRC",
# "Enwiki_ver2" : lambda example : "wikidata_id" in example.keys(),
# "EuroParliamentProceedings_ver2" : lambda example : "language" in example.keys(),
# "USPTO_ver2" : lambda example : ast.literal_eval(example["meta"]["source_data"]) == "USPTO-Application",
# "PileOfLaw_ver2" : lambda example : "dataset" in example.keys(),
# "OtherWiki_ver2" : lambda example : "wiki_source" in example.keys(),
# "S2ORC_ver2" : lambda example : ast.literal_eval(example["meta"]["source"]) == "S2ORC",
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment