Last active
March 3, 2022 16:46
-
-
Save rruntsch/2095f8dbd9a77e59c06bb1ba3e31b194 to your computer and use it in GitHub Desktop.
Python class c_download_prep_excel and controller program to download Excel workbooks from websites and prep them for analytics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
File name: c_download_prep_excel.py | |
Class: c_download_prep_excel | |
Author: Randy Runtsch | |
Date: March 27, 2021 | |
Description: c_download_prep_excel is a Python class used to transcribe data from an Excel worksheet | |
into a new worksheet to prepare it for data analytics. | |
Comments in all but the constructor and get and set functions describe how the function | |
works and can be used. | |
""" | |
from openpyxl import Workbook | |
from openpyxl import load_workbook | |
from openpyxl.styles import Alignment | |
from openpyxl.worksheet.dimensions import ColumnDimension | |
import pyexcel | |
import requests | |
import os | |
class c_download_prep_excel: | |
def __init__(self): | |
self._url = None | |
self._folder_name = None | |
self._file_name = None | |
self._workbook = None | |
self._worksheet_1_name = None | |
self._worsheet_1 = None | |
self._worksheet_2_name = None | |
self._worsheet_2 = None | |
# Set and get functions | |
def set_url(self, url): | |
self._url = url | |
def get_url(self): | |
return self_url | |
def set_file_name(self, file_name): | |
self._file_name = file_name | |
def get_file_name(self): | |
return self._file_name | |
def get_worksheet_name(self): | |
return self._worksheet_name | |
def close_workbook(self): | |
# Save and close the workbook. | |
self._workbook.save(self._file_name) | |
self._workbook.close() | |
def download_file(self): | |
# Download the file from the URL to the specified file name. | |
req = requests.get(self._url, ) | |
open(self._file_name, 'wb').write(req.content) | |
# If the file extension is .xls, then convert the file to a version of an Excel | |
# file with an extension of .xlsx. | |
ext = os.path.splitext(self._file_name) | |
# Rename and xls file to xlsx. | |
if ext[1] == '.xls': | |
# Save the file as an Excel xlsx file and change the self._file_name value to the xlsx file. | |
new_file_name = ext[0] + '.xlsx' | |
pyexcel.save_book_as(file_name = self._file_name, dest_file_name = new_file_name) | |
self._file_name = new_file_name | |
def open_worksheets(self, worksheet_1_name, worksheet_2_name): | |
# Open the workbook and specified worksheet. | |
self._workbook = load_workbook(filename = self._file_name) | |
self._worksheet_1_name = worksheet_1_name | |
self._worksheet_1 = self._workbook.get_sheet_by_name(worksheet_1_name) | |
self._worksheet_2_name = worksheet_2_name | |
self._worksheet_2 = self._workbook.create_sheet(worksheet_2_name) | |
def delete_rows(self, first_row, last_row): | |
# Delete range of rows. | |
self._worksheet_1.delete_rows(first_row, last_row) | |
def truncate_column_int_values(self, col_no, first_row_no, last_row_no, max_len): | |
# Truncate the integer to the number of digits specified in max_len. | |
last_row_no += 1 | |
for row_no in range(first_row_no, last_row_no): | |
value_in = str(self._worksheet_2.cell(row = row_no, column = col_no).value) | |
value_out = value_in[:max_len] | |
self._worksheet_2.cell(row = row_no, column = col_no).value = int(value_out) | |
def delete_col(self, col_no): | |
# Delete the specified column. | |
self._worksheet.delete_cols(col_no) | |
def set_cell(self, cell_value, col_no, row_no): | |
# Set a cell value. | |
self._worksheet_2.cell(row = row_no, column = col_no, value = cell_value) | |
def copy_column(self, source_col, source_first_row, source_last_row, target_col, target_first_row): | |
# Copy a range of cells from worksheet 1 to the specified location in worksheet 2. | |
target_row = target_first_row | |
source_last_row += 1 | |
for source_row in range(source_first_row, source_last_row): | |
target_value = self._worksheet_1.cell(row = source_row, column = source_col).value | |
self._worksheet_2.cell(row = target_row, column = target_col, value = target_value) | |
target_row += 1 | |
def align_text(self, col_no, row_no, direction): | |
# Set cell text alignment. | |
cell = self._worksheet_2.cell(row = row_no, column = col_no) | |
cell.alignment = Alignment(horizontal = direction) | |
def set_column_width(self, col_letter, width): | |
# Set column width. | |
self._worksheet_2.column_dimensions[col_letter].width = width |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
File name: process_fbi_data.py | |
Author: Randy Runtsch | |
Date: March 27, 2021 | |
Description: Download the 2019 FBI violent crime report Excel file. Transcribe its | |
data to a new worksheet to prepare it for data anlytics. | |
""" | |
from c_download_prep_excel import c_download_prep_excel | |
download = c_download_prep_excel() | |
# Download the violent crimes xls file from the fbi.gov site. | |
download.set_url('https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/tables/table-1/table-1.xls/output.xls') | |
download.set_file_name('c:/project_data/fbi/crime_in_us.xls') | |
download.download_file() | |
# Create up a worksheet called Violent Crime Rate. Data will be copied from worksheet 19tbl01. | |
download.open_worksheets('19tbl01', 'Violent Crime Rate') | |
# Set column names. | |
download.set_cell('Year', 1, 1) | |
download.set_cell('Crime Type', 2, 1) | |
download.set_cell('Crime Rate', 3, 1) | |
row_start = 2 | |
# Copy the year column and set the category types. | |
for source_col in range(4, 23, 2): | |
# Copy the years into column 1 for each crime type. | |
download.copy_column(1, 5, 24, 1, row_start) | |
# Set the crime type in column 2 for each year. | |
for row in range(row_start, row_start + 20): | |
if source_col == 4: | |
download.set_cell('Violent Crime Rate', 2, row) | |
elif source_col == 6: | |
download.set_cell('Murder and Non-negligent Manslaughter', 2, row) | |
elif source_col == 8: | |
download.set_cell('Rape - Revised Definition', 2, row) | |
elif source_col == 10: | |
download.set_cell('Rape - Legacy Definition', 2, row) | |
elif source_col == 12: | |
download.set_cell('Robbery', 2, row) | |
elif source_col == 14: | |
download.set_cell('Aggravated Assault', 2, row) | |
elif source_col == 16: | |
download.set_cell('Property', 2, row) | |
elif source_col == 18: | |
download.set_cell('Burglary', 2, row) | |
elif source_col == 20: | |
download.set_cell('Larceny Theft', 2, row) | |
elif source_col == 22: | |
download.set_cell('Motor Vehicle Theft', 2, row) | |
row_start += 20 | |
# 10 crime types with 20 rows each. | |
row_count = (10 * 20) + 1 | |
# Limit year values to 4 digits by removing the superscript from some values. | |
download.truncate_column_int_values(1, 2, row, 4) | |
row_start = 2 | |
# Set the rate values for each year and crime type in column 3. | |
for source_col in range(4, 23, 2): | |
download.copy_column(source_col, 5, 24, 3, row_start) | |
row_start += 20 | |
# Right-justify column names for numberic columns. | |
download.align_text(1, 1, 'right') | |
download.align_text(3, 1, 'right') | |
# Set column widths to fit data. | |
download.set_column_width('A', 6) | |
download.set_column_width('B', 36) | |
download.set_column_width('C', 10) | |
download.close_workbook() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment