Skip to content

Instantly share code, notes, and snippets.

@duhaime
Created February 20, 2021 22:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save duhaime/aff4876811d00db8c7e5cc6719ca6020 to your computer and use it in GitHub Desktop.
Save duhaime/aff4876811d00db8c7e5cc6719ca6020 to your computer and use it in GitHub Desktop.
Ingest EPO SQL data
import itertools
import argparse
import MySQLdb
import json
import glob
import os
# for usage run `python ingest_epo.py -h`
defaults = {
'user': '',
'password': '',
'host': 'localhost',
'db': 'patstat2020b',
}
parser = argparse.ArgumentParser(description='Load patstat data!')
parser.add_argument('--user', type=str, default=defaults['user'], dest='mysql_user', help='mysql username')
parser.add_argument('--password', type=str, default=defaults['password'], dest='mysql_password', help='mysql password')
parser.add_argument('--host', type=str, default=defaults['host'], dest='mysql_host', help='mysql host')
parser.add_argument('--db', type=str, default=defaults['db'], dest='mysql_db', help='mysql db')
parser.add_argument('--csv', type=str, required=True, help='glob path to CSV files')
parser.add_argument('--test', action='store_true')
args = parser.parse_args()
db = MySQLdb.connect(
host=args.mysql_host,
user=args.mysql_user,
passwd=args.mysql_password,
autocommit=True,
)
# initialize the cursor
cursor = db.cursor()
# create the db
cursor.execute('''
CREATE DATABASE IF NOT EXISTS {}
DEFAULT CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;'''.format(args.mysql_db)
)
# use the db
cursor.execute('use {};'.format(args.mysql_db))
# create the tables
cursor.execute('''
CREATE TABLE IF NOT EXISTS tls201_appln (
appln_id int(11) NOT NULL DEFAULT '0',
appln_auth char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
appln_nr varchar(15) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
appln_kind char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '00',
appln_filing_date date NOT NULL DEFAULT '9999-12-31',
appln_filing_year smallint NOT NULL DEFAULT '9999',
appln_nr_epodoc varchar(20) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
appln_nr_original varchar(100) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
ipr_type char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
receiving_office char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
internat_appln_id int(11) NOT NULL DEFAULT '0',
int_phase varchar(11) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
reg_phase varchar(11) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
nat_phase varchar(11) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
earliest_filing_date date NOT NULL DEFAULT '9999-12-31',
earliest_filing_year smallint NOT NULL DEFAULT '9999',
earliest_filing_id int NOT NULL DEFAULT '0',
earliest_publn_date date NOT NULL DEFAULT '9999-12-31',
earliest_publn_year smallint NOT NULL DEFAULT '9999',
earliest_pat_publn_id int NOT NULL DEFAULT '0',
granted char(1) NOT NULL DEFAULT 'N',
docdb_family_id int NOT NULL DEFAULT '0',
inpadoc_family_id int NOT NULL DEFAULT '0',
docdb_family_size smallint NOT NULL default '0',
nb_citing_docdb_fam smallint NOT NULL default '0',
nb_applicants smallint NOT NULL default '0',
nb_inventors smallint NOT NULL default '0',
PRIMARY KEY (appln_id),
KEY IX_internat_appln_id (internat_appln_id),
KEY IX_appln_auth (appln_auth,appln_nr,appln_kind),
KEY IX_appln_filing_date (appln_filing_date),
KEY IX_appln_kind (appln_kind),
KEY IX_docdb_family_id (docdb_family_id),
KEY IX_inpadoc_family_id (inpadoc_family_id),
KEY IX_docdb_family_id_filing_date (docdb_family_id,appln_filing_date),
KEY IX_inpadoc_family_id_filing_date (inpadoc_family_id,appln_filing_date)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls202_appln_title (
appln_id int(11) NOT NULL DEFAULT '0',
appln_title_lg char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
appln_title text COLLATE utf8mb4_unicode_ci NOT NULL,
PRIMARY KEY (appln_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=600;
CREATE TABLE IF NOT EXISTS tls203_appln_abstr (
appln_id int(11) NOT NULL DEFAULT '0',
appln_abstract_lg char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
appln_abstract text COLLATE utf8mb4_unicode_ci,
PRIMARY KEY (appln_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=800;
CREATE TABLE IF NOT EXISTS tls204_appln_prior (
appln_id int(11) NOT NULL DEFAULT '0',
prior_appln_id int(11) NOT NULL DEFAULT '0',
prior_appln_seq_nr smallint(6) NOT NULL DEFAULT '0',
PRIMARY KEY (appln_id,prior_appln_id),
KEY IX_prior_appln_id (prior_appln_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=800;
CREATE TABLE IF NOT EXISTS tls205_tech_rel (
appln_id int(11) NOT NULL DEFAULT '0',
tech_rel_appln_id int(11) NOT NULL DEFAULT '0',
PRIMARY KEY (appln_id,tech_rel_appln_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=100;
CREATE TABLE IF NOT EXISTS tls206_person (
person_id int NOT NULL DEFAULT '0',
person_name varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL,
person_name_orig_lg varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL,
person_address varchar(1000) COLLATE utf8mb4_unicode_ci NOT NULL,
person_ctry_code char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
nuts char(5) NOT NULL DEFAULT '',
nuts_level smallint NOT NULL DEFAULT '9',
doc_std_name_id int NOT NULL DEFAULT '0',
doc_std_name varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
psn_id int NOT NULL DEFAULT '0',
psn_name varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
psn_level smallint NOT NULL DEFAULT '0',
psn_sector varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
han_id int NOT NULL DEFAULT '0',
han_name varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
han_harmonized int NOT NULL DEFAULT '0',
PRIMARY KEY (person_id),
KEY IX_person_ctry_code (person_ctry_code),
KEY IX_doc_std_name_id (doc_std_name_id),
KEY IX_ppat_nuts (nuts),
KEY IX_ppat_psn_name (psn_name(250)),
KEY IX_ppat_psn_sector (psn_sector),
KEY IX_ppat_psn_id (psn_id),
KEY IX_ppat_han_id (han_id),
KEY IX_han_name (han_name(250)),
KEY IX_han_harmonized (han_harmonized)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=100;
CREATE TABLE IF NOT EXISTS tls207_pers_appln (
person_id int(11) NOT NULL DEFAULT '0',
appln_id int(11) NOT NULL DEFAULT '0',
applt_seq_nr smallint(6) NOT NULL DEFAULT '0',
invt_seq_nr smallint(6) NOT NULL DEFAULT '0',
PRIMARY KEY (person_id,appln_id,applt_seq_nr,invt_seq_nr),
KEY IX_person_id (person_id),
KEY IX_appln_id (appln_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls209_appln_ipc (
appln_id int(11) NOT NULL DEFAULT '0',
ipc_class_symbol varchar(15) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
ipc_class_level char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
ipc_version date NOT NULL DEFAULT '9999-12-31',
ipc_value char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
ipc_position char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
ipc_gener_auth char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (appln_id,ipc_class_symbol,ipc_class_level),
KEY IX_ipc_class_symbol (ipc_class_symbol)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls210_appln_n_cls (
appln_id int(11) NOT NULL DEFAULT '0',
nat_class_symbol varchar(15) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (appln_id,nat_class_symbol)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls211_pat_publn (
pat_publn_id int(11) NOT NULL DEFAULT '0',
publn_auth char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
publn_nr varchar(15) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
publn_nr_original varchar(100) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
publn_kind char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
appln_id int(11) NOT NULL DEFAULT '0',
publn_date date NOT NULL DEFAULT '9999-12-31',
publn_lg char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
publn_first_grant char(1) NOT NULL DEFAULT 'N',
publn_claims smallint(6) DEFAULT NULL,
PRIMARY KEY (pat_publn_id),
KEY IX_publn_auth (publn_auth,publn_nr,publn_kind),
KEY IX_appln_id (appln_id),
KEY IX_publn_date (publn_date),
KEY IX_publ_lg (publn_lg)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls212_citation (
pat_publn_id int(11) NOT NULL DEFAULT '0',
citn_replenished int NOT NULL DEFAULT '0',
citn_id smallint(6) NOT NULL DEFAULT '0',
citn_origin char(5) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
cited_pat_publn_id int(11) NOT NULL DEFAULT '0',
cited_appln_id int(10) unsigned NOT NULL DEFAULT '0',
pat_citn_seq_nr smallint(6) NOT NULL DEFAULT '0',
cited_npl_publn_id int(11) NOT NULL DEFAULT '0',
npl_citn_seq_nr smallint(6) NOT NULL DEFAULT '0',
citn_gener_auth char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (pat_publn_id,citn_replenished,citn_id),
KEY IX_cited_pat_publn_id (cited_pat_publn_id),
KEY cited_appln_id (cited_appln_id,pat_publn_id),
KEY IX_pat_citn_seq_nr (pat_citn_seq_nr),
KEY IX_npl_citn_seq_nr (npl_citn_seq_nr),
KEY IX_cited_npl_publn_id (cited_npl_publn_id),
KEY IX_cited_pub_seq_nr (cited_pat_publn_id,pat_citn_seq_nr),
KEY IX_cited_app_seq_nr (cited_appln_id,pat_citn_seq_nr)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci PACK_KEYS=0;
CREATE TABLE IF NOT EXISTS tls214_npl_publn (
npl_publn_id int(11) NOT NULL DEFAULT '0',
npl_type char(1) NOT NULL DEFAULT '',
npl_biblio longtext COLLATE utf8mb4_unicode_ci NOT NULL,
npl_author nvarchar(1000) NOT NULL DEFAULT '',
npl_title1 nvarchar(1000) NOT NULL DEFAULT '',
npl_title2 nvarchar(1000) NOT NULL DEFAULT '',
npl_editor nvarchar(500) NOT NULL DEFAULT '',
npl_volume varchar(50) NOT NULL DEFAULT '',
npl_issue varchar(50) NOT NULL DEFAULT '',
npl_publn_date varchar(8) NOT NULL DEFAULT '',
npl_publn_end_date varchar(8) NOT NULL DEFAULT '',
npl_publisher nvarchar(500) NOT NULL DEFAULT '',
npl_page_first varchar(200) NOT NULL DEFAULT '',
npl_page_last varchar(200) NOT NULL DEFAULT '',
npl_abstract_nr varchar(50) NOT NULL DEFAULT '',
npl_doi varchar(500) NOT NULL DEFAULT '',
npl_isbn varchar(30) NOT NULL DEFAULT '',
npl_issn varchar(30) NOT NULL DEFAULT '',
online_availability varchar(500) NOT NULL DEFAULT '',
online_classification varchar(35) NOT NULL DEFAULT '',
online_search_date varchar(8) NOT NULL DEFAULT '',
PRIMARY KEY (npl_publn_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=150;
CREATE TABLE IF NOT EXISTS tls215_citn_categ (
pat_publn_id int(11) NOT NULL DEFAULT '0',
citn_replenished int NOT NULL DEFAULT '0',
citn_id smallint(6) NOT NULL DEFAULT '0',
citn_categ char(10) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
relevant_claim smallint(6) NOT NULL DEFAULT 0,
PRIMARY KEY (pat_publn_id,citn_replenished,citn_id,citn_categ,relevant_claim)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls216_appln_contn (
appln_id int(11) NOT NULL DEFAULT '0',
parent_appln_id int(11) NOT NULL DEFAULT '0',
contn_type char(3) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (appln_id,parent_appln_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls231_inpadoc_legal_event (
event_id int NOT NULL DEFAULT '0',
appln_id int(11) NOT NULL DEFAULT '0',
event_seq_nr smallint(6) NOT NULL DEFAULT '0',
event_type char(3) NOT NULL DEFAULT ' ',
event_auth char(2) NOT NULL DEFAULT ' ',
event_code varchar(4) NOT NULL DEFAULT '',
event_filing_date date NOT NULL DEFAULT '9999-12-31',
event_publn_date date NOT NULL DEFAULT '9999-12-31',
event_effective_date date NOT NULL DEFAULT '9999-12-31',
event_text varchar(1000) NOT NULL DEFAULT '',
ref_doc_auth char(2) NOT NULL DEFAULT ' ',
ref_doc_nr varchar(20) NOT NULL DEFAULT '',
ref_doc_kind char(2) NOT NULL DEFAULT ' ',
ref_doc_date date NOT NULL DEFAULT '9999-12-31',
ref_doc_text varchar(1000) NOT NULL DEFAULT '',
party_type varchar(3) NOT NULL DEFAULT ' ',
party_seq_nr smallint NOT NULL default '0',
party_new varchar(1000) NOT NULL DEFAULT '',
party_old varchar(1000) NOT NULL DEFAULT '',
spc_nr varchar(40) NOT NULL DEFAULT '',
spc_filing_date date NOT NULL DEFAULT '9999-12-31',
spc_patent_expiry_date date NOT NULL DEFAULT '9999-12-31',
spc_extension_date date NOT NULL DEFAULT '9999-12-31',
spc_text varchar(1000) NOT NULL DEFAULT '',
designated_states varchar(1000) NOT NULL DEFAULT '',
extension_states varchar(30) NOT NULL DEFAULT '',
fee_country char(2) NOT NULL DEFAULT ' ',
fee_payment_date date NOT NULL DEFAULT '9999-12-31',
fee_renewal_year smallint NOT NULL default '9999',
fee_text varchar(1000) NOT NULL DEFAULT '',
lapse_country char(2) NOT NULL DEFAULT ' ',
lapse_date date NOT NULL DEFAULT '9999-12-31',
lapse_text varchar(1000) NOT NULL DEFAULT '',
reinstate_country char(2) NOT NULL DEFAULT ' ',
reinstate_date date NOT NULL DEFAULT '9999-12-31',
reinstate_text varchar(1000) NOT NULL DEFAULT '',
class_scheme varchar(4) NOT NULL DEFAULT '',
class_symbol varchar(50) NOT NULL DEFAULT '',
PRIMARY KEY (appln_id,event_seq_nr),
KEY event_publn_date (event_publn_date,appln_id),
KEY event_type (event_type,appln_id),
KEY event_code (event_code,appln_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=100;
CREATE TABLE IF NOT EXISTS tls222_appln_jp_class (
appln_id int(11) NOT NULL DEFAULT '0',
jp_class_scheme varchar(5) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
jp_class_symbol varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (appln_id,jp_class_scheme,jp_class_symbol),
KEY jp_class_symbol (jp_class_symbol,jp_class_scheme)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls223_appln_docus (
appln_id int(11) NOT NULL DEFAULT '0',
docus_class_symbol varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (appln_id,docus_class_symbol),
KEY docus_class_symbol (docus_class_symbol)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls224_appln_cpc (
appln_id int(11) NOT NULL DEFAULT '0',
cpc_class_symbol varchar(19) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
cpc_scheme varchar(5) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
cpc_version date NOT NULL DEFAULT '9999-12-31',
cpc_value char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
cpc_position char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
cpc_gener_auth char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (appln_id, cpc_class_symbol, cpc_scheme)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls226_person_orig (
person_orig_id int(11) NOT NULL DEFAULT '0',
person_id int(11) NOT NULL DEFAULT '0',
source char(5) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
source_version varchar(10) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
name_freeform varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
person_name_orig_lg varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL,
last_name varchar(400) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
first_name varchar(200) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
middle_name varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
address_freeform varchar(1000) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
address_1 varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
address_2 varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
address_3 varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
address_4 varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
address_5 varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
street varchar(500) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
city varchar(200) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
zip_code varchar(30) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
state char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
person_ctry_code char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
residence_ctry_code char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
role varchar(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (person_orig_id),
KEY person_id (person_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls227_pers_publn (
person_id int(11) NOT NULL DEFAULT '0',
pat_publn_id int(11) NOT NULL DEFAULT '0',
applt_seq_nr smallint(6) NOT NULL DEFAULT '0',
invt_seq_nr smallint(6) NOT NULL DEFAULT '0',
PRIMARY KEY (person_id,pat_publn_id,applt_seq_nr,invt_seq_nr),
KEY pat_publn_id (pat_publn_id),
KEY person_id (person_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls228_docdb_fam_citn (
docdb_family_id int(11) NOT NULL DEFAULT '0',
cited_docdb_family_id int(11) NOT NULL DEFAULT '0',
PRIMARY KEY (docdb_family_id,cited_docdb_family_id),
KEY docdb_family_id (docdb_family_id),
KEY cited_docdb_family_id (cited_docdb_family_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls229_appln_nace2 (
appln_id int(11) NOT NULL DEFAULT '0',
nace2_code char(5) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
weight float NOT NULL DEFAULT '1',
PRIMARY KEY (appln_id,nace2_code),
KEY nace2_code (nace2_code)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls230_appln_techn_field (
appln_id int(11) NOT NULL DEFAULT '0',
techn_field_nr tinyint NOT NULL DEFAULT '0',
weight float NOT NULL DEFAULT '1',
PRIMARY KEY (appln_id,techn_field_nr)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls801_country (
ctry_code varchar(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
iso_alpha3 varchar(3) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
st3_name varchar(100) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
state_indicator char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
continent varchar(25) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
eu_member char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
epo_member char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
oecd_member char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
discontinued char(1) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (ctry_code)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=100;
CREATE TABLE IF NOT EXISTS tls803_legal_event_code (
event_auth char(2) NOT NULL DEFAULT '',
event_code varchar(4) NOT NULL DEFAULT '',
event_impact char(1) NOT NULL DEFAULT '',
event_descr varchar(250) NOT NULL DEFAULT '',
event_descr_orig varchar(250) NOT NULL DEFAULT '',
event_category_code char(1) NOT NULL DEFAULT '',
event_category_title varchar(100) NOT NULL DEFAULT '',
PRIMARY KEY (event_auth, event_code)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=100;
CREATE TABLE IF NOT EXISTS tls901_techn_field_ipc (
ipc_maingroup_symbol varchar(8) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
techn_field_nr tinyint(4) NOT NULL DEFAULT '0',
techn_sector varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
techn_field varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (techn_field_nr,techn_sector,techn_field,ipc_maingroup_symbol)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=100;
CREATE TABLE IF NOT EXISTS tls902_ipc_nace2 (
ipc varchar(8) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
not_with_ipc varchar(8) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
unless_with_ipc varchar(8) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
nace2_code char(5) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
nace2_weight float NOT NULL DEFAULT '1',
nace2_descr varchar(150) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (ipc,not_with_ipc,unless_with_ipc,nace2_code)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls904_nuts (
nuts varchar(5) NOT NULL,
nuts_level int NOT NULL DEFAULT '0',
nuts_label varchar(250) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (nuts)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS tls909_eee_ppat (
person_id int(11) NOT NULL DEFAULT '0',
person_ctry_code char(2) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
person_name varchar(400) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
hrm_l1 varchar(400) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
hrm_l2 varchar(400) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
hrm_level tinyint(4) NOT NULL DEFAULT '0',
hrm_l2_id int(11) NOT NULL DEFAULT '0',
sector varchar(50) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
person_address varchar(1000) COLLATE utf8mb4_unicode_ci NOT NULL,
doc_std_name_id int(11) NOT NULL DEFAULT '0',
pat_cnt int(11) NOT NULL,
PRIMARY KEY (person_id),
KEY IX_ppat_person_ctry_code (person_ctry_code),
KEY IX_ppat_hrm_l1 (hrm_l1(333)),
KEY IX_ppat_hrm_l2 (hrm_l2(333)),
KEY IX_ppat_sector (sector),
KEY IX_ppat_hrm_l2_id (hrm_l2_id)
) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci AVG_ROW_LENGTH=100;
''')
# if user specified --test, copy the first several lines from each spreadsheet into a tmp file
csv_files = glob.glob(args.csv)
delimiter = '\n'
if not csv_files:
raise Exception('CSV files not found! Please provide a glob of files, e.g. python create_sql.py --csv ./data/*.csv')
if args.test:
if not os.path.exists('tmp'):
os.makedirs('tmp')
for i in csv_files:
print(' * creating minimal dataset from', i)
with open(i) as f:
lines = list(itertools.islice(f, 10)) # slice the first 10 lines
out_path = os.path.join('tmp', os.path.basename(i))
if not os.path.exists(out_path):
with open(out_path, 'w') as out:
out.write(''.join(lines).rstrip())
# use the test files for importing
csv_files = glob.glob(os.path.join('tmp', '*.csv'))
# map each table prefix to the full table name
table_map = {i.split('_')[0]: i for i in '''
tls201_appln
tls202_appln_title
tls204_appln_prior
tls205_tech_rel
tls206_person
tls904_nuts
tls906_person
tls207_pers_appln
tls209_appln_ipc
tls210_appln_n_cls
tls211_pat_publn
tls212_citation
tls214_npl_publn
tls215_citn_categ
tls216_appln_contn
tls222_appln_jp_class
tls223_appln_docus
tls224_appln_cpc
tls226_person_orig
tls227_pers_publn
tls228_docdb_fam_citn
tls229_appln_nace2
tls230_appln_techn_field
tls801_country
tls803_legal_event_code
tls901_techn_field_ipc
tls902_ipc_nace2
tls203_appln_abstr
tls231_inpadoc_legal_event
'''.strip().split()}
# load the csv data
errors = []
for i in csv_files:
try:
table = table_map[os.path.basename(i).split('_')[0]]
except KeyError:
print(' * table missing from schema', i)
continue
print(' * loading data from', i, 'into', table)
# open the file and insert each row
with open(i) as f:
f = iter(f.readlines())
headers = next(f)
for j in f:
try:
cmd = 'INSERT INTO {}({}) VALUES({})'.format(table, headers, j)
cursor.execute(cmd)
except Exception as exc:
# do not halt if a duplicate row is encountered (the duplicate won't be written)
if isinstance(exc, MySQLdb._exceptions.IntegrityError):
continue
if isinstance(exc, MySQLdb._exceptions.OperationalError):
errors.append([i, headers, j])
with open('errors.json', 'w') as out:
json.dump(errors, out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment