Last active
November 24, 2022 18:57
-
-
Save ingridstevens/aef42f6ef5d19a73d11ba3c244df0a55 to your computer and use it in GitHub Desktop.
Extract fields from a text file with a regex, load them into a pandas DataFrame, and export the result to CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import PyPDF2 \n", | |
"from PyPDF2 import PdfReader\n", | |
"import os\n", | |
"import re\n", | |
"\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3.5.64 South Africa \n", | |
"Heading Section One\n", | |
"Stock: 110,000\n", | |
"Rank: 11\n", | |
"\n", | |
"Second Heading\n", | |
"Stock: 120,000\n", | |
"Rank: 22\n", | |
"Source and methods \n", | |
"\n", | |
"3.5.64 South Vietnam \n", | |
"\n", | |
"Heading Section One \n", | |
"Stock: 130,000\n", | |
"Rank: 33\n", | |
"\n", | |
"Second Heading\n", | |
"Stock: 140,000\n", | |
"Rank: 44\n", | |
"Source and methods \n", | |
"\n", | |
"3.5.6 South Dakota of Denmark \n", | |
"\n", | |
"Heading Section One \n", | |
"Stock: 130,000\n", | |
"Rank: 33\n", | |
"\n", | |
"Second Heading\n", | |
"Stock: 140,000\n", | |
"Rank: 44\n", | |
"Source and methods \n" | |
] | |
} | |
], | |
"source": [ | |
"# Load the sample report text that the regex cell further down parses.\n", | |
"# The path is relative to the notebook's working directory.\n", | |
"text_path = \"regex.txt\"\n", | |
"\n", | |
"# Decode explicitly as UTF-8 so the result does not depend on the\n", | |
"# platform's default locale encoding.\n", | |
"with open(text_path, 'r', encoding='utf-8') as file:\n", | |
"    text = file.read()\n", | |
"\n", | |
"# Echo the raw text so the layout the regex has to match is visible.\n", | |
"print(text)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# NOTE: Only use if you need some text from a PDF \n", | |
"\n", | |
"# pdf_path = \"/path/to.pdf\"\n", | |
"\n", | |
"# # PdfReader (imported at the top) replaces the deprecated\n", | |
"# # PdfFileReader / numPages / getPage / extractText names, and it\n", | |
"# # accepts the path directly, so no file handle is left open.\n", | |
"# reader = PdfReader(pdf_path)\n", | |
"\n", | |
"# # concatenate the extracted text of every page\n", | |
"# text = ''\n", | |
"# for page in reader.pages:\n", | |
"#     text += page.extract_text()\n", | |
"# print(text)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3.5.64 South Africa \n", | |
"Heading Section One\n", | |
"Stock: 110,000\n", | |
"Rank: 11\n", | |
"\n", | |
"Second Heading\n", | |
"Stock: 120,000\n", | |
"Rank: 22\n", | |
"Source and methods \n", | |
"\n", | |
"3.5.64 South Vietnam \n", | |
"\n", | |
"Heading Section One \n", | |
"Stock: 130,000\n", | |
"Rank: 33\n", | |
"\n", | |
"Second Heading\n", | |
"Stock: 140,000\n", | |
"Rank: 44\n", | |
"Source and methods \n", | |
"\n", | |
"3.5.6 South Dakota of Denmark \n", | |
"\n", | |
"Heading Section One \n", | |
"Stock: 130,000\n", | |
"Rank: 33\n", | |
"\n", | |
"Second Heading\n", | |
"Stock: 140,000\n", | |
"Rank: 44\n", | |
"Source and methods \n" | |
] | |
} | |
], | |
"source": [ | |
"# Read regex.txt into `text` for the regex cell below. Running this\n", | |
"# cell also overwrites any `text` produced by the optional PDF cell above.\n", | |
"text_path = \"regex.txt\"\n", | |
"\n", | |
"# Decode explicitly as UTF-8 so the result does not depend on the\n", | |
"# platform's default locale encoding.\n", | |
"with open(text_path, 'r', encoding='utf-8') as file:\n", | |
"    text = file.read()\n", | |
"\n", | |
"# Echo the raw text so the layout the regex has to match is visible.\n", | |
"print(text)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[(' South Africa ', '110,000\\n', ' 11', '120,000\\n', ' 22'), (' South Vietnam ', '130,000\\n', ' 33', '140,000\\n', ' 44'), (' South Dakota of Denmark ', '130,000\\n', ' 33', '140,000\\n', ' 44')]\n" | |
] | |
} | |
], | |
"source": [ | |
"# Pull five fields out of every numbered section of the text:\n", | |
"#   the heading text after the section number, then Stock and Rank under\n", | |
"#   'Heading Section One', then Stock and Rank under 'Second Heading'.\n", | |
"# The dots of the '3.x.y' section number are escaped so they only match\n", | |
"# a literal '.', and the gaps after each heading are plain lazy runs\n", | |
"# (no quantifier nested inside another quantifier), which keeps\n", | |
"# backtracking bounded when a section is incomplete.\n", | |
"pattern = (\n", | |
"    r\"3\\.\\d\\.\\d+([\\S\\s]*?)\\n*\"\n", | |
"    r\"Heading Section One[\\S\\s]*?\\n*\"\n", | |
"    r\"Stock: ([\\S\\s]*?)Rank:([\\S\\s]*?)\\n*\"\n", | |
"    r\"Second Heading[\\S\\s]*?\\n*\"\n", | |
"    r\"Stock: ([\\S\\s]*?)Rank:([\\S\\s]*?)\\n*\"\n", | |
"    r\"Source and methods\"\n", | |
")\n", | |
"\n", | |
"# findall returns one 5-tuple of captured strings per section; the\n", | |
"# pattern has no ^/$ anchors, so no flags are needed.\n", | |
"matches = re.findall(pattern, text)\n", | |
"\n", | |
"# print the matches\n", | |
"print(matches)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Build one dataframe row per regex match. The captured strings keep the\n", | |
"# whitespace around each value (e.g. '110,000\\n', ' 11'), so trim every\n", | |
"# field before it reaches the dataframe and, later, the CSV export.\n", | |
"rows = [tuple(field.strip() for field in match) for match in matches]\n", | |
"\n", | |
"df = pd.DataFrame(rows, columns = ['Country', 'stock_units', 'stock_rank', 'sold_units', 'sold_rank'])\n", | |
"\n", | |
"# add the year column where each year is 2021\n", | |
"df['year'] = 2021\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" Country stock_units stock_rank sold_units sold_rank year\n", | |
"0 South Africa 110,000\\n 11 120,000\\n 22 2021\n", | |
"1 South Vietnam 130,000\\n 33 140,000\\n 44 2021\n", | |
"2 South Dakota of Denmark 130,000\\n 33 140,000\\n 44 2021\n" | |
] | |
} | |
], | |
"source": [ | |
"# show the assembled table as plain text\n", | |
"print(df)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# write the table to regex.csv, leaving the row index out of the file\n", | |
"df.to_csv('regex.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.10.1 64-bit", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.1" | |
}, | |
"orig_nbformat": 4, | |
"vscode": { | |
"interpreter": { | |
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3.5.64 South Africa | |
Heading Section One | |
Stock: 110,000 | |
Rank: 11 | |
Second Heading | |
Stock: 120,000 | |
Rank: 22 | |
Source and methods | |
3.5.64 South Vietnam | |
Heading Section One | |
Stock: 130,000 | |
Rank: 33 | |
Second Heading | |
Stock: 140,000 | |
Rank: 44 | |
Source and methods | |
3.5.6 South Dakota of Denmark | |
Heading Section One | |
Stock: 130,000 | |
Rank: 33 | |
Second Heading | |
Stock: 140,000 | |
Rank: 44 | |
Source and methods |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment