Skip to content

Instantly share code, notes, and snippets.

@rruntsch
Last active March 3, 2022 16:46
Show Gist options
  • Save rruntsch/2095f8dbd9a77e59c06bb1ba3e31b194 to your computer and use it in GitHub Desktop.
Save rruntsch/2095f8dbd9a77e59c06bb1ba3e31b194 to your computer and use it in GitHub Desktop.
Python class c_download_prep_excel and controller program to download Excel workbooks from websites and prep them for analytics
"""
File name: c_download_prep_excel.py
Class: c_download_prep_excel
Author: Randy Runtsch
Date: March 27, 2021
Description: c_download_prep_excel is a Python class used to transcribe data from an Excel worksheet
into a new worksheet to prepare it for data analytics.
Comments in all but the constructor and get and set functions describe how the function
works and can be used.
"""
from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.styles import Alignment
from openpyxl.worksheet.dimensions import ColumnDimension
import pyexcel
import requests
import os
class c_download_prep_excel:
def __init__(self):
self._url = None
self._folder_name = None
self._file_name = None
self._workbook = None
self._worksheet_1_name = None
self._worsheet_1 = None
self._worksheet_2_name = None
self._worsheet_2 = None
# Set and get functions
def set_url(self, url):
self._url = url
def get_url(self):
return self_url
def set_file_name(self, file_name):
self._file_name = file_name
def get_file_name(self):
return self._file_name
def get_worksheet_name(self):
return self._worksheet_name
def close_workbook(self):
# Save and close the workbook.
self._workbook.save(self._file_name)
self._workbook.close()
def download_file(self):
# Download the file from the URL to the specified file name.
req = requests.get(self._url, )
open(self._file_name, 'wb').write(req.content)
# If the file extension is .xls, then convert the file to a version of an Excel
# file with an extension of .xlsx.
ext = os.path.splitext(self._file_name)
# Rename and xls file to xlsx.
if ext[1] == '.xls':
# Save the file as an Excel xlsx file and change the self._file_name value to the xlsx file.
new_file_name = ext[0] + '.xlsx'
pyexcel.save_book_as(file_name = self._file_name, dest_file_name = new_file_name)
self._file_name = new_file_name
def open_worksheets(self, worksheet_1_name, worksheet_2_name):
# Open the workbook and specified worksheet.
self._workbook = load_workbook(filename = self._file_name)
self._worksheet_1_name = worksheet_1_name
self._worksheet_1 = self._workbook.get_sheet_by_name(worksheet_1_name)
self._worksheet_2_name = worksheet_2_name
self._worksheet_2 = self._workbook.create_sheet(worksheet_2_name)
def delete_rows(self, first_row, last_row):
# Delete range of rows.
self._worksheet_1.delete_rows(first_row, last_row)
def truncate_column_int_values(self, col_no, first_row_no, last_row_no, max_len):
# Truncate the integer to the number of digits specified in max_len.
last_row_no += 1
for row_no in range(first_row_no, last_row_no):
value_in = str(self._worksheet_2.cell(row = row_no, column = col_no).value)
value_out = value_in[:max_len]
self._worksheet_2.cell(row = row_no, column = col_no).value = int(value_out)
def delete_col(self, col_no):
# Delete the specified column.
self._worksheet.delete_cols(col_no)
def set_cell(self, cell_value, col_no, row_no):
# Set a cell value.
self._worksheet_2.cell(row = row_no, column = col_no, value = cell_value)
def copy_column(self, source_col, source_first_row, source_last_row, target_col, target_first_row):
# Copy a range of cells from worksheet 1 to the specified location in worksheet 2.
target_row = target_first_row
source_last_row += 1
for source_row in range(source_first_row, source_last_row):
target_value = self._worksheet_1.cell(row = source_row, column = source_col).value
self._worksheet_2.cell(row = target_row, column = target_col, value = target_value)
target_row += 1
def align_text(self, col_no, row_no, direction):
# Set cell text alignment.
cell = self._worksheet_2.cell(row = row_no, column = col_no)
cell.alignment = Alignment(horizontal = direction)
def set_column_width(self, col_letter, width):
# Set column width.
self._worksheet_2.column_dimensions[col_letter].width = width
"""
File name: process_fbi_data.py
Author: Randy Runtsch
Date: March 27, 2021
Description: Download the 2019 FBI violent crime report Excel file. Transcribe its
data to a new worksheet to prepare it for data anlytics.
"""
from c_download_prep_excel import c_download_prep_excel
download = c_download_prep_excel()
# Download the violent crimes xls file from the fbi.gov site.
download.set_url('https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/tables/table-1/table-1.xls/output.xls')
download.set_file_name('c:/project_data/fbi/crime_in_us.xls')
download.download_file()
# Create up a worksheet called Violent Crime Rate. Data will be copied from worksheet 19tbl01.
download.open_worksheets('19tbl01', 'Violent Crime Rate')
# Set column names.
download.set_cell('Year', 1, 1)
download.set_cell('Crime Type', 2, 1)
download.set_cell('Crime Rate', 3, 1)
row_start = 2
# Copy the year column and set the category types.
for source_col in range(4, 23, 2):
# Copy the years into column 1 for each crime type.
download.copy_column(1, 5, 24, 1, row_start)
# Set the crime type in column 2 for each year.
for row in range(row_start, row_start + 20):
if source_col == 4:
download.set_cell('Violent Crime Rate', 2, row)
elif source_col == 6:
download.set_cell('Murder and Non-negligent Manslaughter', 2, row)
elif source_col == 8:
download.set_cell('Rape - Revised Definition', 2, row)
elif source_col == 10:
download.set_cell('Rape - Legacy Definition', 2, row)
elif source_col == 12:
download.set_cell('Robbery', 2, row)
elif source_col == 14:
download.set_cell('Aggravated Assault', 2, row)
elif source_col == 16:
download.set_cell('Property', 2, row)
elif source_col == 18:
download.set_cell('Burglary', 2, row)
elif source_col == 20:
download.set_cell('Larceny Theft', 2, row)
elif source_col == 22:
download.set_cell('Motor Vehicle Theft', 2, row)
row_start += 20
# 10 crime types with 20 rows each.
row_count = (10 * 20) + 1
# Limit year values to 4 digits by removing the superscript from some values.
download.truncate_column_int_values(1, 2, row, 4)
row_start = 2
# Set the rate values for each year and crime type in column 3.
for source_col in range(4, 23, 2):
download.copy_column(source_col, 5, 24, 3, row_start)
row_start += 20
# Right-justify column names for numberic columns.
download.align_text(1, 1, 'right')
download.align_text(3, 1, 'right')
# Set column widths to fit data.
download.set_column_width('A', 6)
download.set_column_width('B', 36)
download.set_column_width('C', 10)
download.close_workbook()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment