Skip to content

Instantly share code, notes, and snippets.

@lobstrio
Last active April 14, 2023 19:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lobstrio/5fc088d44bba8383bf3f91acb11ebd3b to your computer and use it in GitHub Desktop.
Save lobstrio/5fc088d44bba8383bf3f91acb11ebd3b to your computer and use it in GitHub Desktop.
Scrape PDFs programmatically with Python 3 and the Tika library
from tika import parser
import re
import csv
# Column names of the output CSV. parse_pdf() zips its extracted values with
# this list, so the order here must match the order of the extracted fields.
HEADERS = [
    'numero_gestion',
    'a_jour_au',
    'numero_rcs',
    'date_immatriculation',
    'raison_sociale',
    'forme_juridique',
    'capital_social',
    'adresse_siege',
    'activites_principals',
]
# One lookbehind pattern per HEADERS column, in the same order.
#
# The patterns are applied to ``str(raw)``, i.e. the repr of the whole tika
# response dict. In that repr a line break inside the extracted text shows up
# as a literal backslash followed by ``n``, which is why ``[^\\]+`` (and the
# ``\s`` classes, which cannot match a backslash) stop at the end of a line.
_FIELD_PATTERNS = (
    r'(?<=N° de gestion )\w+',
    r'(?<=à jour au )[\w\s]+',
    r'(?<=Immatriculation au RCS, numéro )[\w\s\.]+',
    r'(?<=Date d\'immatriculation )[\d\/]+',
    # Was ``\w+``, which cut multi-word names ("CAPTAIN DATA" -> "CAPTAIN");
    # now reads to the end of the line like the other free-text fields.
    r'(?<=Dénomination ou raison sociale )[^\\]+',
    r'(?<=Forme juridique )[^\\]+',
    r'(?<=Capital social )[^\\]+',
    r'(?<=Adresse du siège )[^\\]+',
    r'(?<=Activités principales )[^\\]+',
)


def parse_pdf(filename):
    """Extract the registration fields from one PDF into a dict.

    The file is handed to ``tika.parser.from_file`` and every pattern in
    ``_FIELD_PATTERNS`` is searched in the stringified response. All matches
    of a pattern are concatenated into a single value.

    Args:
        filename: Path of the PDF, passed unchanged to ``parser.from_file``.

    Returns:
        A dict mapping each name in ``HEADERS`` to the extracted string.

    Raises:
        TypeError: If tika does not return a dict.
        RuntimeError: If the response status is not 200.
        ValueError: If at least one field could not be found in the document.
    """
    raw = parser.from_file(filename)

    # Explicit checks instead of ``assert`` so they survive ``python -O``.
    if not isinstance(raw, dict):
        raise TypeError(
            "unexpected tika response for {!r}: {}".format(filename, type(raw).__name__)
        )
    status = raw.get('status')
    if status != 200:
        raise RuntimeError(
            "tika failed to parse {!r} (status {!r})".format(filename, status)
        )

    # Match against the repr of the full response (see _FIELD_PATTERNS).
    content = str(raw)
    values = ["".join(re.findall(pattern, content)) for pattern in _FIELD_PATTERNS]

    if not all(values):
        missing = [header for header, value in zip(HEADERS, values) if not value]
        raise ValueError(
            "could not extract {} from {!r}".format(", ".join(missing), filename)
        )

    return dict(zip(HEADERS, values))
def write_csv(rows):
    """Write the parsed rows to ``parsed_pdf.csv`` in the working directory.

    The file is overwritten on every call. It starts with a header line built
    from ``HEADERS``, followed by one line per row.

    Args:
        rows: Non-empty sequence of dicts keyed by the names in ``HEADERS``.

    Raises:
        ValueError: If ``rows`` is empty (or a row has a key that is not in
            ``HEADERS``, raised by ``csv.DictWriter``).
    """
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if not rows:
        raise ValueError("no rows to write")

    # newline='' leaves line endings to the csv module (otherwise blank lines
    # appear between rows on Windows); utf-8 keeps accented characters intact
    # regardless of the platform's default encoding.
    with open('parsed_pdf.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=HEADERS)
        writer.writeheader()
        writer.writerows(rows)
if __name__ == "__main__":
    import sys

    # Sample extracts parsed when no path is given on the command line.
    default_filenames = [
        "/Users/sashabouloudnine/Desktop/LOBSTR - Extrait d'immatriculation.pdf",
        "/Users/sashabouloudnine/Desktop/VOSTOKINC - Extrait d'immatriculation.pdf",
        "/Users/sashabouloudnine/Desktop/CAPTAIN DATA - Extrait d'immatriculation.pdf",
        "/Users/sashabouloudnine/Desktop/PHANTOMBUSTER - Extrait d'immatriculation.pdf",
    ]

    # Usage: python <script> [file.pdf ...]
    # Paths given as arguments replace the sample list above.
    filenames = sys.argv[1:] or default_filenames

    # Parse every PDF, then write all rows to the CSV in one go.
    rows = [parse_pdf(filename) for filename in filenames]
    write_csv(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment