Skip to content

Instantly share code, notes, and snippets.

@mneedham
Last active March 27, 2019 10:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mneedham/51bdaaa1d4ec5b8ec9676462817a4b87 to your computer and use it in GitHub Desktop.
Save mneedham/51bdaaa1d4ec5b8ec9676462817a4b87 to your computer and use it in GitHub Desktop.
import csv
import glob
import json
import os
# Accumulators filled while the input files are scanned and written out
# once scanning is complete.
# articles maps an article id to a dict with "abstract", "title" and "year".
articles = {}
# Distinct author names and distinct venue names seen in the input.
authors, venues = set(), set()
def write_header(file, fields):
    """Write ``fields`` to ``file`` as one comma-separated CSV row.

    Args:
        file: A writable text file object accepted by ``csv.writer``.
        fields: Iterable of column header values forming the single row.
    """
    csv.writer(file, delimiter=",").writerow(fields)
# Pass 1: scan every JSON-lines file under dblp-ref/, stream the relationship
# rows (article->reference, article->author, article->venue) straight to CSV,
# and collect the article/author/venue values in the module-level accumulators.
# NOTE(review): the ":START_ID(...)"/":END_ID(...)" headers look like the
# Neo4j bulk-import header format -- confirm against the consumer.

# The output directory must exist before the files below can be created.
os.makedirs("data", exist_ok=True)

# newline="" is what the csv module requires of its output files (otherwise
# "\r\n" row terminators get translated again on some platforms); the explicit
# encoding keeps non-ASCII names independent of the platform locale.
with open("data/article_REFERENCES_article.csv", "w",
          newline="", encoding="utf-8") as article_references_article_file, \
        open("data/article_REFERENCES_article_header.csv", "w",
             newline="", encoding="utf-8") as article_references_article_header_file, \
        open("data/article_AUTHOR_author.csv", "w",
             newline="", encoding="utf-8") as article_author_author_file, \
        open("data/article_AUTHOR_author_header.csv", "w",
             newline="", encoding="utf-8") as article_author_author_header_file, \
        open("data/article_VENUE_venue.csv", "w",
             newline="", encoding="utf-8") as article_venue_venue_file, \
        open("data/article_VENUE_venue_header.csv", "w",
             newline="", encoding="utf-8") as article_venue_venue_header_file:
    # Each relationship type gets a separate one-row header file.
    write_header(article_references_article_header_file, [":START_ID(Article)", ":END_ID(Article)"])
    write_header(article_author_author_header_file, [":START_ID(Article)", ":END_ID(Author)"])
    write_header(article_venue_venue_header_file, [":START_ID(Article)", ":END_ID(Venue)"])

    articles_references_article_writer = csv.writer(article_references_article_file, delimiter=",")
    article_author_author_file_writer = csv.writer(article_author_author_file, delimiter=",")
    article_venue_venue_file_writer = csv.writer(article_venue_venue_file, delimiter=",")

    # Sorted so the input files are processed in a reproducible order.
    for file_path in sorted(glob.glob("dblp-ref/*.json")):
        with open(file_path, "r", encoding="utf-8") as file:
            # One JSON document per line.
            for line in file:
                if not line.strip():
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                item = json.loads(line)
                article_id = item["id"]

                # "year" is read with .get because the article export also
                # reads it with .get, i.e. it is treated as optional there.
                articles[article_id] = {
                    "abstract": item.get("abstract", ""),
                    "title": item["title"],
                    "year": item.get("year"),
                }

                # A missing or empty venue yields no Venue node/relationship.
                venue = item.get("venue")
                if venue:
                    venues.add(venue)
                    article_venue_venue_file_writer.writerow([article_id, venue])

                for reference in item.get("references", []):
                    articles_references_article_writer.writerow([article_id, reference])

                for author in item.get("authors", []):
                    authors.add(author)
                    article_author_author_file_writer.writerow([article_id, author])
# Pass 2: write the node files (articles, authors, venues) from the
# accumulators filled during the scan, each with a separate one-row header file.
# newline="" is what the csv module requires of its output files; the explicit
# encoding keeps non-ASCII text independent of the platform locale.
with open("data/articles.csv", "w",
          newline="", encoding="utf-8") as articles_file, \
        open("data/articles_header.csv", "w",
             newline="", encoding="utf-8") as articles_header_file, \
        open("data/authors.csv", "w",
             newline="", encoding="utf-8") as authors_file, \
        open("data/authors_header.csv", "w",
             newline="", encoding="utf-8") as authors_header_file, \
        open("data/venues.csv", "w",
             newline="", encoding="utf-8") as venues_file, \
        open("data/venues_header.csv", "w",
             newline="", encoding="utf-8") as venues_header_file:
    write_header(articles_header_file, ["index:ID(Article)", "title:string", "abstract:string", "year:int"])
    write_header(authors_header_file, ["name:ID(Author)"])
    write_header(venues_header_file, ["name:ID(Venue)"])

    # Column order must match the articles header written above.
    articles_writer = csv.writer(articles_file, delimiter=",")
    for article_id, article in articles.items():
        # A year of None is written by csv as an empty field.
        articles_writer.writerow(
            [article_id, article["title"], article["abstract"], article.get("year")]
        )

    # authors and venues are sets, so their row order is unspecified.
    authors_writer = csv.writer(authors_file, delimiter=",")
    authors_writer.writerows([author] for author in authors)

    venues_writer = csv.writer(venues_file, delimiter=",")
    venues_writer.writerows([venue] for venue in venues)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment