Skip to content

Instantly share code, notes, and snippets.

View richard-orr's full-sized avatar
🤠

Richard Orr richard-orr

🤠
View GitHub Profile
import csv
import glob
import gzip
import json
import os
SNAPSHOT_DIR = 'openalex-snapshot'
CSV_DIR = 'csv-files'
FILES_PER_ENTITY = int(os.environ.get('OPENALEX_DEMO_FILES_PER_ENTITY', '0'))
{
"doi": "10.1242/jcs.241513",
"doi_url": "https://doi.org/10.1242/jcs.241513",
"title": "Regulation of intrinsic polarity establishment by a differentiation-type MAPK pathway",
"genre": "journal-article",
"is_paratext": false,
"published_date": "2020-01-01",
"year": 2020,
"journal_name": "Journal of Cell Science",
"journal_issns": "1477-9137,0021-9533",
@richard-orr
richard-orr / copy-openalex-csv.sql
Last active March 14, 2023 20:49
load OpenAlex CSV files into PostgreSQL
--institutions
\copy openalex.institutions (id, ror, display_name, country_code, type, homepage_url, image_url, image_thumbnail_url, display_name_acroynyms, display_name_alternatives, works_count, cited_by_count, works_api_url, updated_date) from program 'gunzip -c csv-files/institutions.csv.gz' csv header
\copy openalex.institutions_ids (institution_id, openalex, ror, grid, wikipedia, wikidata, mag) from program 'gunzip -c csv-files/institutions_ids.csv.gz' csv header
\copy openalex.institutions_geo (institution_id, city, geonames_city_id, region, country_code, country, latitude, longitude) from program 'gunzip -c csv-files/institutions_geo.csv.gz' csv header
\copy openalex.institutions_associated_institutions (institution_id, associated_institution_id, relationship) from program 'gunzip -c csv-files/institutions_associated_institutions.csv.gz' csv header
\copy openalex.institutions_counts_by_year (institution_id, year, works_count, cited_by_count) from program 'gunzip -c csv-files/institutions_counts_by_ye
@richard-orr
richard-orr / flatten-openalex-jsonl.py
Last active March 15, 2023 02:27
flatten OpenAlex JSON Lines files to CSV readable by PostgreSQL
import csv
import glob
import gzip
import json
import os
SNAPSHOT_DIR = 'openalex-snapshot'
CSV_DIR = 'csv-files'
FILES_PER_ENTITY = int(os.environ.get('OPENALEX_DEMO_FILES_PER_ENTITY', '0'))
@richard-orr
richard-orr / openalex-pg-schema.sql
Last active March 14, 2023 20:46
create the OpenAlex schema in PostgreSQL
--
-- PostgreSQL database dump
--
-- Dumped from database version 13.5 (Ubuntu 13.5-2.heroku1+1)
-- Dumped by pg_dump version 14.1
SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
window_start | window_end | requests_per_hr
----------------------------+----------------------------+-----------------
2020-04-30 00:45:21.774789 | 2020-04-30 12:45:21.774789 | 133680
2020-04-29 22:45:23.355262 | 2020-04-30 10:45:23.355262 | 128614
2020-04-29 20:45:22.787457 | 2020-04-30 08:45:22.787457 | 129403
2020-04-29 18:45:23.303541 | 2020-04-30 06:45:23.303541 | 132036
2020-04-29 16:45:21.376992 | 2020-04-30 04:45:21.376992 | 133006
2020-04-29 14:45:23.256443 | 2020-04-30 02:45:23.256443 | 129461
2020-04-29 12:45:23.311751 | 2020-04-30 00:45:23.311751 | 131144
2020-04-29 10:45:27.438666 | 2020-04-29 22:45:27.438666 | 137141