Skip to content

Instantly share code, notes, and snippets.

@ingridstevens
Last active November 24, 2022 18:57
Show Gist options
  • Save ingridstevens/aef42f6ef5d19a73d11ba3c244df0a55 to your computer and use it in GitHub Desktop.
Save ingridstevens/aef42f6ef5d19a73d11ba3c244df0a55 to your computer and use it in GitHub Desktop.
Regex out of a file into a pandas dataframe into csv
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import PyPDF2 \n",
"from PyPDF2 import PdfReader\n",
"import os\n",
"import re\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.5.64 South Africa \n",
"Heading Section One\n",
"Stock: 110,000\n",
"Rank: 11\n",
"\n",
"Second Heading\n",
"Stock: 120,000\n",
"Rank: 22\n",
"Source and methods \n",
"\n",
"3.5.64 South Vietnam \n",
"\n",
"Heading Section One \n",
"Stock: 130,000\n",
"Rank: 33\n",
"\n",
"Second Heading\n",
"Stock: 140,000\n",
"Rank: 44\n",
"Source and methods \n",
"\n",
"3.5.6 South Dakota of Denmark \n",
"\n",
"Heading Section One \n",
"Stock: 130,000\n",
"Rank: 33\n",
"\n",
"Second Heading\n",
"Stock: 140,000\n",
"Rank: 44\n",
"Source and methods \n"
]
}
],
"source": [
"# Load the input text file regex.txt (the path is relative to the working directory)\n",
"\n",
"# file location: regex.txt\n",
"\n",
"text_path = \"regex.txt\"\n",
"\n",
"# load the text file\n",
"with open(text_path, 'r') as file:\n",
" text = file.read()\n",
"\n",
"# print the text\n",
"print(text)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
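"# NOTE: Optional cell. Uncomment it only if the text has to be extracted from a PDF.\n",
"# The calls below use the legacy PyPDF2 names (PdfFileReader, numPages, getPage, extractText);\n",
"# newer PyPDF2 releases provide PdfReader, reader.pages and page.extract_text() instead.\n",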
"\n",
"# pdf_path = \"/path/to.pdf\"\n",
"\n",
"# # creating a pdf file object\n",
"# pdfFileObject = open(pdf_path, 'rb')\n",
"\n",
"# pdfReader = PyPDF2.PdfFileReader(pdfFileObject)\n",
"\n",
"# text=''\n",
"# for i in range(0,pdfReader.numPages):\n",
"# # creating a page object\n",
"# pageObj = pdfReader.getPage(i)\n",
"# # extracting text from page\n",
"# text=text+pageObj.extractText()\n",
"# print(text)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.5.64 South Africa \n",
"Heading Section One\n",
"Stock: 110,000\n",
"Rank: 11\n",
"\n",
"Second Heading\n",
"Stock: 120,000\n",
"Rank: 22\n",
"Source and methods \n",
"\n",
"3.5.64 South Vietnam \n",
"\n",
"Heading Section One \n",
"Stock: 130,000\n",
"Rank: 33\n",
"\n",
"Second Heading\n",
"Stock: 140,000\n",
"Rank: 44\n",
"Source and methods \n",
"\n",
"3.5.6 South Dakota of Denmark \n",
"\n",
"Heading Section One \n",
"Stock: 130,000\n",
"Rank: 33\n",
"\n",
"Second Heading\n",
"Stock: 140,000\n",
"Rank: 44\n",
"Source and methods \n"
]
}
],
"source": [
"# Load the input text file regex.txt (the path is relative to the working directory)\n",
"\n",
"# file location: regex.txt\n",
"\n",
"text_path = \"regex.txt\"\n",
"\n",
"# load the text file\n",
"with open(text_path, 'r') as file:\n",
" text = file.read()\n",
"\n",
"# print the text\n",
"print(text)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(' South Africa ', '110,000\\n', ' 11', '120,000\\n', ' 22'), (' South Vietnam ', '130,000\\n', ' 33', '140,000\\n', ' 44'), (' South Dakota of Denmark ', '130,000\\n', ' 33', '140,000\\n', ' 44')]\n"
]
}
],
"source": [
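"# Apply the regex below to the text. Each match is one country section and captures five groups:\n",
"# the country name, Stock and Rank under 'Heading Section One', then Stock and Rank under 'Second Heading'.\n",
"# The captured strings keep their surrounding whitespace/newlines (e.g. '110,000\\n', ' 11').\n",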
"\n",
"pattern = r\"3.\\d.\\d+([\\S\\s]*?)\\n*Heading Section One(?:[\\S\\s]*?)*\\n*Stock: ([\\S\\s]*?)Rank:([\\S\\s]*?)\\n*Second Heading(?:[\\S\\s]*?)*\\n*Stock: ([\\S\\s]*?)Rank:([\\S\\s]*?)\\n*Source and methods\"\n",
"\n",
"matches = re.findall(pattern, text, flags = re.MULTILINE)\n",
"\n",
"\n",
"# print the matches\n",
"print(matches)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Build a DataFrame with one row per match and one column per captured group\n",
"\n",
"df = pd.DataFrame(matches, columns = ['Country', 'stock_units', 'stock_rank', 'sold_units', 'sold_rank'])\n",
"\n",
"# Add a constant 'year' column (2021 for every row)\n",
"df['year'] = 2021\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Country stock_units stock_rank sold_units sold_rank year\n",
"0 South Africa 110,000\\n 11 120,000\\n 22 2021\n",
"1 South Vietnam 130,000\\n 33 140,000\\n 44 2021\n",
"2 South Dakota of Denmark 130,000\\n 33 140,000\\n 44 2021\n"
]
}
],
"source": [
"# print the dataframe\n",
"print(df)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# export the dataframe as a csv file\n",
"df.to_csv('regex.csv', index = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.1 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.1"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
3.5.64 South Africa
Heading Section One
Stock: 110,000
Rank: 11
Second Heading
Stock: 120,000
Rank: 22
Source and methods
3.5.64 South Vietnam
Heading Section One
Stock: 130,000
Rank: 33
Second Heading
Stock: 140,000
Rank: 44
Source and methods
3.5.6 South Dakota of Denmark
Heading Section One
Stock: 130,000
Rank: 33
Second Heading
Stock: 140,000
Rank: 44
Source and methods
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment