Last active
March 27, 2019 10:40
-
-
Save mneedham/51bdaaa1d4ec5b8ec9676462817a4b87 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import json | |
import csv | |
# In-memory accumulators filled while the input files are scanned.
# articles maps an article id to a dict with "abstract", "title" and "year";
# authors and venues hold the distinct names seen, so each is emitted once.
articles = dict()
authors, venues = set(), set()
def write_header(file, fields):
    """Write *fields* as a single comma-separated CSV row to *file*.

    *file* is any writable text file object; *fields* is the sequence of
    column names that makes up the header row.
    """
    header_writer = csv.writer(file, delimiter=",")
    header_writer.writerow(fields)
# Scan every "dblp-ref/*.json" file (one JSON object per line) exactly once.
# Relationship rows are written immediately; the node data (articles, authors,
# venues) is collected in the module-level containers for a later pass.
#
# Output files use newline="" as the csv module requires (otherwise the
# "\r\n" row terminator is translated a second time on Windows) and an
# explicit UTF-8 encoding so non-ASCII names do not depend on the locale.
with open("data/article_REFERENCES_article.csv", "w", newline="", encoding="utf-8") as article_references_article_file, \
        open("data/article_REFERENCES_article_header.csv", "w", newline="", encoding="utf-8") as article_references_article_header_file, \
        open("data/article_AUTHOR_author.csv", "w", newline="", encoding="utf-8") as article_author_author_file, \
        open("data/article_AUTHOR_author_header.csv", "w", newline="", encoding="utf-8") as article_author_author_header_file, \
        open("data/article_VENUE_venue.csv", "w", newline="", encoding="utf-8") as article_venue_venue_file, \
        open("data/article_VENUE_venue_header.csv", "w", newline="", encoding="utf-8") as article_venue_venue_header_file:
    # Header rows live in their own files, separate from the data rows.
    write_header(article_references_article_header_file, [":START_ID(Article)", ":END_ID(Article)"])
    write_header(article_author_author_header_file, [":START_ID(Article)", ":END_ID(Author)"])
    write_header(article_venue_venue_header_file, [":START_ID(Article)", ":END_ID(Venue)"])

    articles_references_article_writer = csv.writer(article_references_article_file, delimiter=",")
    article_author_author_file_writer = csv.writer(article_author_author_file, delimiter=",")
    article_venue_venue_file_writer = csv.writer(article_venue_venue_file, delimiter=",")

    for file_path in glob.glob("dblp-ref/*.json"):
        with open(file_path, "r", encoding="utf-8") as file:
            # Iterate the file lazily instead of a manual readline() loop.
            for line in file:
                # A blank line is not a JSON document; skip it rather than
                # letting json.loads() abort the whole export.
                if not line.strip():
                    continue

                item = json.loads(line)
                article_id = item["id"]
                articles[article_id] = {
                    "abstract": item.get("abstract", ""),
                    "title": item["title"],
                    "year": item["year"],
                }

                # A missing venue is treated like an empty one: no Venue node
                # and no relationship row.
                venue = item.get("venue")
                if venue:
                    venues.add(venue)
                    article_venue_venue_file_writer.writerow([article_id, venue])

                # "or []" also covers a key that is present but null.
                for reference in item.get("references") or []:
                    articles_references_article_writer.writerow([article_id, reference])

                for author in item.get("authors") or []:
                    authors.add(author)
                    article_author_author_file_writer.writerow([article_id, author])
# Write the node files (articles, authors, venues) from the data collected
# in the module-level containers, each with a separate one-row header file.
#
# Files use newline="" as the csv module requires (otherwise the "\r\n" row
# terminator is translated a second time on Windows) and an explicit UTF-8
# encoding so non-ASCII titles and names do not depend on the locale.
with open("data/articles.csv", "w", newline="", encoding="utf-8") as articles_file, \
        open("data/articles_header.csv", "w", newline="", encoding="utf-8") as articles_header_file, \
        open("data/authors.csv", "w", newline="", encoding="utf-8") as authors_file, \
        open("data/authors_header.csv", "w", newline="", encoding="utf-8") as authors_header_file, \
        open("data/venues.csv", "w", newline="", encoding="utf-8") as venues_file, \
        open("data/venues_header.csv", "w", newline="", encoding="utf-8") as venues_header_file:
    write_header(articles_header_file, ["index:ID(Article)", "title:string", "abstract:string", "year:int"])
    write_header(authors_header_file, ["name:ID(Author)",])
    write_header(venues_header_file, ["name:ID(Venue)"])

    # One row per article, columns in the same order as the header above.
    articles_writer = csv.writer(articles_file, delimiter=",")
    articles_writer.writerows(
        [article_id, article["title"], article["abstract"], article.get("year")]
        for article_id, article in articles.items()
    )

    # Authors and venues are single-column files: the name is the node id.
    authors_writer = csv.writer(authors_file, delimiter=",")
    authors_writer.writerows([author] for author in authors)

    venues_writer = csv.writer(venues_file, delimiter=",")
    venues_writer.writerows([venue] for venue in venues)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment