ingridstevens/regex.ipynb

## regex.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import PyPDF2 \n",
    "from PyPDF2 import PdfReader\n",
    "import os\n",
    "import re\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.5.64 South Africa \n",
      "Heading Section One\n",
      "Stock: 110,000\n",
      "Rank: 11\n",
      "\n",
      "Second Heading\n",
      "Stock: 120,000\n",
      "Rank: 22\n",
      "Source and methods \n",
      "\n",
      "3.5.64 South Vietnam \n",
      "\n",
      "Heading Section One \n",
      "Stock: 130,000\n",
      "Rank: 33\n",
      "\n",
      "Second Heading\n",
      "Stock: 140,000\n",
      "Rank: 44\n",
      "Source and methods \n",
      "\n",
      "3.5.6 South Dakota of Denmark \n",
      "\n",
      "Heading Section One \n",
      "Stock: 130,000\n",
      "Rank: 33\n",
      "\n",
      "Second Heading\n",
      "Stock: 140,000\n",
      "Rank: 44\n",
      "Source and methods \n"
     ]
    }
   ],
   "source": [
    "# Load the sample report text that the regex is applied to.\n",
    "# The path is relative to the notebook's working directory.\n",
    "text_path = \"regex.txt\"\n",
    "\n",
    "# An explicit encoding keeps the read independent of the platform default.\n",
    "with open(text_path, 'r', encoding='utf-8') as file:\n",
    "    text = file.read()\n",
    "\n",
    "# Show the raw text so the section layout can be checked by eye.\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: Optional - uncomment only if the text has to be extracted from a PDF.\n",
    "# The snippet uses the PdfReader API (already imported at the top of the notebook);\n",
    "# PdfFileReader, numPages, getPage and extractText are the deprecated PyPDF2 names.\n",
    "\n",
    "# pdf_path = \"/path/to.pdf\"\n",
    "\n",
    "# # create a reader for the PDF file\n",
    "# pdfReader = PdfReader(pdf_path)\n",
    "\n",
    "# text = ''\n",
    "# for pageObj in pdfReader.pages:\n",
    "#     # extract the text of the page and append it\n",
    "#     text = text + pageObj.extract_text()\n",
    "# print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.5.64 South Africa \n",
      "Heading Section One\n",
      "Stock: 110,000\n",
      "Rank: 11\n",
      "\n",
      "Second Heading\n",
      "Stock: 120,000\n",
      "Rank: 22\n",
      "Source and methods \n",
      "\n",
      "3.5.64 South Vietnam \n",
      "\n",
      "Heading Section One \n",
      "Stock: 130,000\n",
      "Rank: 33\n",
      "\n",
      "Second Heading\n",
      "Stock: 140,000\n",
      "Rank: 44\n",
      "Source and methods \n",
      "\n",
      "3.5.6 South Dakota of Denmark \n",
      "\n",
      "Heading Section One \n",
      "Stock: 130,000\n",
      "Rank: 33\n",
      "\n",
      "Second Heading\n",
      "Stock: 140,000\n",
      "Rank: 44\n",
      "Source and methods \n"
     ]
    }
   ],
   "source": [
    "# Re-read regex.txt so that `text` is fresh before the regex is applied.\n",
    "# The path is relative to the notebook's working directory.\n",
    "text_path = \"regex.txt\"\n",
    "\n",
    "# An explicit encoding keeps the read independent of the platform default.\n",
    "with open(text_path, 'r', encoding='utf-8') as file:\n",
    "    text = file.read()\n",
    "\n",
    "# Show the raw text so the section layout can be checked by eye.\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(' South Africa ', '110,000\\n', ' 11', '120,000\\n', ' 22'), (' South Vietnam ', '130,000\\n', ' 33', '140,000\\n', ' 44'), (' South Dakota of Denmark ', '130,000\\n', ' 33', '140,000\\n', ' 44')]\n"
     ]
    }
   ],
   "source": [
    "# Pull one record per numbered section (e.g. \"3.5.64 South Africa\").\n",
    "# Groups, in order: section title, then the Stock and Rank values listed under\n",
    "# \"Heading Section One\", then the Stock and Rank values under \"Second Heading\".\n",
    "#\n",
    "# - The dots of the section number are escaped so they only match a literal '.'.\n",
    "# - [\\s\\S]*? is a lazy \"any character, newlines included\" match.\n",
    "# - The \\s* on both sides of every group keeps padding and line breaks out of the\n",
    "#   captured values ('110,000' rather than '110,000\\n', '11' rather than ' 11').\n",
    "pattern = (\n",
    "    r\"3\\.\\d\\.\\d+\\s*([\\s\\S]*?)\\s*\"\n",
    "    r\"Heading Section One[\\s\\S]*?Stock:\\s*([\\s\\S]*?)\\s*Rank:\\s*([\\s\\S]*?)\\s*\"\n",
    "    r\"Second Heading[\\s\\S]*?Stock:\\s*([\\s\\S]*?)\\s*Rank:\\s*([\\s\\S]*?)\\s*\"\n",
    "    r\"Source and methods\"\n",
    ")\n",
    "\n",
    "# No flags are needed: the pattern has no ^/$ anchors (so re.MULTILINE would be a\n",
    "# no-op) and it never relies on '.' matching a newline.\n",
    "matches = re.findall(pattern, text)\n",
    "\n",
    "# Each match is a 5-tuple of strings.\n",
    "print(matches)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the table: one row per regex match, one column per capture group.\n",
    "column_names = ['Country', 'stock_units', 'stock_rank', 'sold_units', 'sold_rank']\n",
    "df = pd.DataFrame(data = matches, columns = column_names)\n",
    "\n",
    "# Every row gets the same constant year.\n",
    "df['year'] = 2021\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                     Country stock_units stock_rank sold_units sold_rank  year\n",
      "0              South Africa    110,000\\n         11  120,000\\n        22  2021\n",
      "1             South Vietnam    130,000\\n         33  140,000\\n        44  2021\n",
      "2   South Dakota of Denmark    130,000\\n         33  140,000\\n        44  2021\n"
     ]
    }
   ],
   "source": [
    "# Plain-text dump of the assembled table.\n",
    "print(df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the table to regex.csv, leaving out the row index.\n",
    "df.to_csv(path_or_buf='regex.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.1 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.1"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## regex.txt
3.5.64 South Africa
Heading Section One
Stock: 110,000
Rank: 11

Second Heading
Stock: 120,000
Rank: 22
Source and methods

3.5.64 South Vietnam

Heading Section One
Stock: 130,000
Rank: 33

Second Heading
Stock: 140,000
Rank: 44
Source and methods

3.5.6 South Dakota of Denmark

Heading Section One
Stock: 130,000
Rank: 33

Second Heading
Stock: 140,000
Rank: 44
Source and methods
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"import PyPDF2 \n",
	"from PyPDF2 import PdfReader\n",
	"import os\n",
	"import re\n",
	"\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"3.5.64 South Africa \n",
	"Heading Section One\n",
	"Stock: 110,000\n",
	"Rank: 11\n",
	"\n",
	"Second Heading\n",
	"Stock: 120,000\n",
	"Rank: 22\n",
	"Source and methods \n",
	"\n",
	"3.5.64 South Vietnam \n",
	"\n",
	"Heading Section One \n",
	"Stock: 130,000\n",
	"Rank: 33\n",
	"\n",
	"Second Heading\n",
	"Stock: 140,000\n",
	"Rank: 44\n",
	"Source and methods \n",
	"\n",
	"3.5.6 South Dakota of Denmark \n",
	"\n",
	"Heading Section One \n",
	"Stock: 130,000\n",
	"Rank: 33\n",
	"\n",
	"Second Heading\n",
	"Stock: 140,000\n",
	"Rank: 44\n",
	"Source and methods \n"
	]
	}
	],
	"source": [
	"# Load the sample report text that the regex is applied to.\n",
	"# The path is relative to the notebook's working directory.\n",
	"text_path = \"regex.txt\"\n",
	"\n",
	"# An explicit encoding keeps the read independent of the platform default.\n",
	"with open(text_path, 'r', encoding='utf-8') as file:\n",
	"    text = file.read()\n",
	"\n",
	"# Show the raw text so the section layout can be checked by eye.\n",
	"print(text)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"# NOTE: Optional - uncomment only if the text has to be extracted from a PDF.\n",
	"# The snippet uses the PdfReader API (already imported at the top of the notebook);\n",
	"# PdfFileReader, numPages, getPage and extractText are the deprecated PyPDF2 names.\n",
	"\n",
	"# pdf_path = \"/path/to.pdf\"\n",
	"\n",
	"# # create a reader for the PDF file\n",
	"# pdfReader = PdfReader(pdf_path)\n",
	"\n",
	"# text = ''\n",
	"# for pageObj in pdfReader.pages:\n",
	"#     # extract the text of the page and append it\n",
	"#     text = text + pageObj.extract_text()\n",
	"# print(text)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"3.5.64 South Africa \n",
	"Heading Section One\n",
	"Stock: 110,000\n",
	"Rank: 11\n",
	"\n",
	"Second Heading\n",
	"Stock: 120,000\n",
	"Rank: 22\n",
	"Source and methods \n",
	"\n",
	"3.5.64 South Vietnam \n",
	"\n",
	"Heading Section One \n",
	"Stock: 130,000\n",
	"Rank: 33\n",
	"\n",
	"Second Heading\n",
	"Stock: 140,000\n",
	"Rank: 44\n",
	"Source and methods \n",
	"\n",
	"3.5.6 South Dakota of Denmark \n",
	"\n",
	"Heading Section One \n",
	"Stock: 130,000\n",
	"Rank: 33\n",
	"\n",
	"Second Heading\n",
	"Stock: 140,000\n",
	"Rank: 44\n",
	"Source and methods \n"
	]
	}
	],
	"source": [
	"# Re-read regex.txt so that `text` is fresh before the regex is applied.\n",
	"# The path is relative to the notebook's working directory.\n",
	"text_path = \"regex.txt\"\n",
	"\n",
	"# An explicit encoding keeps the read independent of the platform default.\n",
	"with open(text_path, 'r', encoding='utf-8') as file:\n",
	"    text = file.read()\n",
	"\n",
	"# Show the raw text so the section layout can be checked by eye.\n",
	"print(text)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[(' South Africa ', '110,000\\n', ' 11', '120,000\\n', ' 22'), (' South Vietnam ', '130,000\\n', ' 33', '140,000\\n', ' 44'), (' South Dakota of Denmark ', '130,000\\n', ' 33', '140,000\\n', ' 44')]\n"
	]
	}
	],
	"source": [
	"# Pull one record per numbered section (e.g. \"3.5.64 South Africa\").\n",
	"# Groups, in order: section title, then the Stock and Rank values listed under\n",
	"# \"Heading Section One\", then the Stock and Rank values under \"Second Heading\".\n",
	"#\n",
	"# - The dots of the section number are escaped so they only match a literal '.'.\n",
	"# - [\\s\\S]*? is a lazy \"any character, newlines included\" match; without the\n",
	"#   '*' quantifier the groups could hold at most one character and nothing matched.\n",
	"# - The \\s* on both sides of every group keeps padding and line breaks out of the\n",
	"#   captured values ('110,000' rather than '110,000\\n', '11' rather than ' 11').\n",
	"pattern = (\n",
	"    r\"3\\.\\d\\.\\d+\\s*([\\s\\S]*?)\\s*\"\n",
	"    r\"Heading Section One[\\s\\S]*?Stock:\\s*([\\s\\S]*?)\\s*Rank:\\s*([\\s\\S]*?)\\s*\"\n",
	"    r\"Second Heading[\\s\\S]*?Stock:\\s*([\\s\\S]*?)\\s*Rank:\\s*([\\s\\S]*?)\\s*\"\n",
	"    r\"Source and methods\"\n",
	")\n",
	"\n",
	"# No flags are needed: the pattern has no ^/$ anchors (so re.MULTILINE would be a\n",
	"# no-op) and it never relies on '.' matching a newline.\n",
	"matches = re.findall(pattern, text)\n",
	"\n",
	"# Each match is a 5-tuple of strings.\n",
	"print(matches)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build the table: one row per regex match, one column per capture group.\n",
	"column_names = ['Country', 'stock_units', 'stock_rank', 'sold_units', 'sold_rank']\n",
	"df = pd.DataFrame(data = matches, columns = column_names)\n",
	"\n",
	"# Every row gets the same constant year.\n",
	"df['year'] = 2021\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" Country stock_units stock_rank sold_units sold_rank year\n",
	"0 South Africa 110,000\\n 11 120,000\\n 22 2021\n",
	"1 South Vietnam 130,000\\n 33 140,000\\n 44 2021\n",
	"2 South Dakota of Denmark 130,000\\n 33 140,000\\n 44 2021\n"
	]
	}
	],
	"source": [
	"# Plain-text dump of the assembled table.\n",
	"print(df)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Write the table to regex.csv, leaving out the row index.\n",
	"df.to_csv(path_or_buf='regex.csv', index=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3.10.1 64-bit",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.1"
	},
	"orig_nbformat": 4,
	"vscode": {
	"interpreter": {
	"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}
	3.5.64 South Africa
	Heading Section One
	Stock: 110,000
	Rank: 11

	Second Heading
	Stock: 120,000
	Rank: 22
	Source and methods

	3.5.64 South Vietnam

	Heading Section One
	Stock: 130,000
	Rank: 33

	Second Heading
	Stock: 140,000
	Rank: 44
	Source and methods

	3.5.6 South Dakota of Denmark

	Heading Section One
	Stock: 130,000
	Rank: 33

	Second Heading
	Stock: 140,000
	Rank: 44
	Source and methods