rruntsch/c_download_prep_excel.py

## c_download_prep_excel.py
"""
    File name:      c_download_prep_excel.py
    Class:          c_download_prep_excel
    Author:         Randy Runtsch
    Date:           March 27, 2021
    Description:    c_download_prep_excel is a Python class used to transcribe data from an Excel worksheet
                    into a new worksheet to prepare it for data analytics.

                    Comments in all but the constructor and get and set functions describe how the function
                    works and can be used.
"""

from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.styles import Alignment
from openpyxl.worksheet.dimensions import ColumnDimension

import pyexcel

import requests
import os

class c_download_prep_excel:

    def __init__(self):

        self._url = None
        self._folder_name = None
        self._file_name = None

        self._workbook = None

        self._worksheet_1_name = None
        self._worsheet_1 = None

        self._worksheet_2_name = None
        self._worsheet_2 = None

    # Set and get functions

    def set_url(self, url):

        self._url = url

    def get_url(self):

        return self_url

    def set_file_name(self, file_name):

        self._file_name = file_name

    def get_file_name(self):

        return self._file_name

    def get_worksheet_name(self):

        return self._worksheet_name

    def close_workbook(self):

        # Save and close the workbook.

        self._workbook.save(self._file_name)
        self._workbook.close()

    def download_file(self):

        # Download the file from the URL to the specified file name.

        req = requests.get(self._url, )
        open(self._file_name, 'wb').write(req.content)

        # If the file extension is .xls, then convert the file to a version of an Excel
        # file with an extension of .xlsx.

        ext = os.path.splitext(self._file_name)

        # Rename and xls file to xlsx.
        if ext[1] == '.xls':

            # Save the file as an Excel xlsx file and change the self._file_name value to the xlsx file.

            new_file_name = ext[0] + '.xlsx'
            pyexcel.save_book_as(file_name = self._file_name, dest_file_name = new_file_name)
            self._file_name = new_file_name

    def open_worksheets(self, worksheet_1_name, worksheet_2_name):

        # Open the workbook and specified worksheet.

        self._workbook = load_workbook(filename = self._file_name)
        self._worksheet_1_name = worksheet_1_name
        self._worksheet_1 = self._workbook.get_sheet_by_name(worksheet_1_name)

        self._worksheet_2_name = worksheet_2_name
        self._worksheet_2 = self._workbook.create_sheet(worksheet_2_name)

    def delete_rows(self, first_row, last_row):

        # Delete range of rows.

        self._worksheet_1.delete_rows(first_row, last_row)

    def truncate_column_int_values(self, col_no, first_row_no, last_row_no, max_len):

        # Truncate the integer to the number of digits specified in max_len.

        last_row_no += 1

        for row_no in range(first_row_no, last_row_no):

            value_in = str(self._worksheet_2.cell(row = row_no, column = col_no).value)
            value_out = value_in[:max_len]
            self._worksheet_2.cell(row = row_no, column = col_no).value = int(value_out)

    def delete_col(self, col_no):

        # Delete the specified column.

        self._worksheet.delete_cols(col_no)

    def set_cell(self, cell_value, col_no, row_no):

        # Set a cell value.

        self._worksheet_2.cell(row = row_no, column = col_no, value = cell_value)

    def copy_column(self, source_col, source_first_row, source_last_row, target_col, target_first_row):

        # Copy a range of cells from worksheet 1 to the specified location in worksheet 2.

        target_row = target_first_row
        source_last_row += 1

        for source_row in range(source_first_row, source_last_row):

            target_value = self._worksheet_1.cell(row = source_row, column = source_col).value

            self._worksheet_2.cell(row = target_row, column = target_col, value = target_value)

            target_row += 1

    def align_text(self, col_no, row_no, direction):

        # Set cell text alignment.

        cell = self._worksheet_2.cell(row = row_no, column = col_no)

        cell.alignment = Alignment(horizontal = direction)

    def set_column_width(self, col_letter, width):

        # Set column width.

        self._worksheet_2.column_dimensions[col_letter].width = width

## process_fbi_data.py
"""
    File name:      process_fbi_data.py
    Author:         Randy Runtsch
    Date:           March 27, 2021
    Description:    Download the 2019 FBI violent crime report Excel file. Transcribe its
                    data to a new worksheet to prepare it for data anlytics.
"""

from c_download_prep_excel import c_download_prep_excel

download = c_download_prep_excel()

# Download the violent crimes xls file from the fbi.gov site.

download.set_url('https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/tables/table-1/table-1.xls/output.xls')
download.set_file_name('c:/project_data/fbi/crime_in_us.xls')
download.download_file()

# Create up a worksheet called Violent Crime Rate. Data will be copied from worksheet 19tbl01.
download.open_worksheets('19tbl01', 'Violent Crime Rate')

# Set column names.

download.set_cell('Year', 1, 1)
download.set_cell('Crime Type', 2, 1)
download.set_cell('Crime Rate', 3, 1)

row_start = 2

# Copy the year column and set the category types.
for source_col in range(4, 23, 2):

    # Copy the years into column 1 for each crime type.
    download.copy_column(1, 5, 24, 1, row_start)

    # Set the crime type in column 2 for each year.
    for row in range(row_start, row_start + 20):
        if source_col == 4:
            download.set_cell('Violent Crime Rate', 2, row)
        elif source_col == 6:
            download.set_cell('Murder and Non-negligent Manslaughter', 2, row)
        elif source_col == 8:
            download.set_cell('Rape - Revised Definition', 2, row)
        elif source_col == 10:
            download.set_cell('Rape - Legacy Definition', 2, row)
        elif source_col == 12:
            download.set_cell('Robbery', 2, row)
        elif source_col == 14:
            download.set_cell('Aggravated Assault', 2, row)
        elif source_col == 16:
            download.set_cell('Property', 2, row)
        elif source_col == 18:
            download.set_cell('Burglary', 2, row)
        elif source_col == 20:
            download.set_cell('Larceny Theft', 2, row)
        elif source_col == 22:
            download.set_cell('Motor Vehicle Theft', 2, row)

    row_start += 20

    # 10 crime types with 20 rows each.
    row_count = (10 * 20) + 1

# Limit year values to 4 digits by removing the superscript from some values.

download.truncate_column_int_values(1, 2, row, 4)

row_start = 2

# Set the rate values for each year and crime type in column 3.

for source_col in range(4, 23, 2):

    download.copy_column(source_col, 5, 24, 3, row_start)

    row_start += 20

# Right-justify column names for numberic columns.

download.align_text(1, 1, 'right')
download.align_text(3, 1, 'right')

# Set column widths to fit data.

download.set_column_width('A', 6)
download.set_column_width('B', 36)
download.set_column_width('C', 10)

download.close_workbook()
	"""
	File name: c_download_prep_excel.py
	Class: c_download_prep_excel
	Author: Randy Runtsch
	Date: March 27, 2021
	Description: c_download_prep_excel is a Python class used to transcribe data from an Excel worksheet
	into a new worksheet to prepare it for data analytics.

	Comments in all but the constructor and get and set functions describe how the function
	works and can be used.
	"""

	from openpyxl import Workbook
	from openpyxl import load_workbook
	from openpyxl.styles import Alignment
	from openpyxl.worksheet.dimensions import ColumnDimension

	import pyexcel

	import requests
	import os

	class c_download_prep_excel:

	def __init__(self):

	self._url = None
	self._folder_name = None
	self._file_name = None

	self._workbook = None

	self._worksheet_1_name = None
	self._worsheet_1 = None

	self._worksheet_2_name = None
	self._worsheet_2 = None

	# Set and get functions

	def set_url(self, url):

	self._url = url

	def get_url(self):

	return self_url

	def set_file_name(self, file_name):

	self._file_name = file_name

	def get_file_name(self):

	return self._file_name

	def get_worksheet_name(self):

	return self._worksheet_name

	def close_workbook(self):

	# Save and close the workbook.

	self._workbook.save(self._file_name)
	self._workbook.close()

	def download_file(self):

	# Download the file from the URL to the specified file name.

	req = requests.get(self._url, )
	open(self._file_name, 'wb').write(req.content)

	# If the file extension is .xls, then convert the file to a version of an Excel
	# file with an extension of .xlsx.

	ext = os.path.splitext(self._file_name)

	# Rename and xls file to xlsx.
	if ext[1] == '.xls':

	# Save the file as an Excel xlsx file and change the self._file_name value to the xlsx file.

	new_file_name = ext[0] + '.xlsx'
	pyexcel.save_book_as(file_name = self._file_name, dest_file_name = new_file_name)
	self._file_name = new_file_name

	def open_worksheets(self, worksheet_1_name, worksheet_2_name):

	# Open the workbook and specified worksheet.

	self._workbook = load_workbook(filename = self._file_name)
	self._worksheet_1_name = worksheet_1_name
	self._worksheet_1 = self._workbook.get_sheet_by_name(worksheet_1_name)

	self._worksheet_2_name = worksheet_2_name
	self._worksheet_2 = self._workbook.create_sheet(worksheet_2_name)

	def delete_rows(self, first_row, last_row):

	# Delete range of rows.

	self._worksheet_1.delete_rows(first_row, last_row)

	def truncate_column_int_values(self, col_no, first_row_no, last_row_no, max_len):

	# Truncate the integer to the number of digits specified in max_len.

	last_row_no += 1

	for row_no in range(first_row_no, last_row_no):

	value_in = str(self._worksheet_2.cell(row = row_no, column = col_no).value)
	value_out = value_in[:max_len]
	self._worksheet_2.cell(row = row_no, column = col_no).value = int(value_out)

	def delete_col(self, col_no):

	# Delete the specified column.

	self._worksheet.delete_cols(col_no)

	def set_cell(self, cell_value, col_no, row_no):

	# Set a cell value.

	self._worksheet_2.cell(row = row_no, column = col_no, value = cell_value)

	def copy_column(self, source_col, source_first_row, source_last_row, target_col, target_first_row):

	# Copy a range of cells from worksheet 1 to the specified location in worksheet 2.

	target_row = target_first_row
	source_last_row += 1

	for source_row in range(source_first_row, source_last_row):

	target_value = self._worksheet_1.cell(row = source_row, column = source_col).value

	self._worksheet_2.cell(row = target_row, column = target_col, value = target_value)

	target_row += 1

	def align_text(self, col_no, row_no, direction):

	# Set cell text alignment.

	cell = self._worksheet_2.cell(row = row_no, column = col_no)

	cell.alignment = Alignment(horizontal = direction)

	def set_column_width(self, col_letter, width):

	# Set column width.

	self._worksheet_2.column_dimensions[col_letter].width = width
	"""
	File name: process_fbi_data.py
	Author: Randy Runtsch
	Date: March 27, 2021
	Description: Download the 2019 FBI violent crime report Excel file. Transcribe its
	data to a new worksheet to prepare it for data anlytics.
	"""

	from c_download_prep_excel import c_download_prep_excel

	download = c_download_prep_excel()

	# Download the violent crimes xls file from the fbi.gov site.

	download.set_url('https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/tables/table-1/table-1.xls/output.xls')
	download.set_file_name('c:/project_data/fbi/crime_in_us.xls')
	download.download_file()

	# Create up a worksheet called Violent Crime Rate. Data will be copied from worksheet 19tbl01.
	download.open_worksheets('19tbl01', 'Violent Crime Rate')

	# Set column names.

	download.set_cell('Year', 1, 1)
	download.set_cell('Crime Type', 2, 1)
	download.set_cell('Crime Rate', 3, 1)

	row_start = 2

	# Copy the year column and set the category types.
	for source_col in range(4, 23, 2):

	# Copy the years into column 1 for each crime type.
	download.copy_column(1, 5, 24, 1, row_start)

	# Set the crime type in column 2 for each year.
	for row in range(row_start, row_start + 20):
	if source_col == 4:
	download.set_cell('Violent Crime Rate', 2, row)
	elif source_col == 6:
	download.set_cell('Murder and Non-negligent Manslaughter', 2, row)
	elif source_col == 8:
	download.set_cell('Rape - Revised Definition', 2, row)
	elif source_col == 10:
	download.set_cell('Rape - Legacy Definition', 2, row)
	elif source_col == 12:
	download.set_cell('Robbery', 2, row)
	elif source_col == 14:
	download.set_cell('Aggravated Assault', 2, row)
	elif source_col == 16:
	download.set_cell('Property', 2, row)
	elif source_col == 18:
	download.set_cell('Burglary', 2, row)
	elif source_col == 20:
	download.set_cell('Larceny Theft', 2, row)
	elif source_col == 22:
	download.set_cell('Motor Vehicle Theft', 2, row)

	row_start += 20

	# 10 crime types with 20 rows each.
	row_count = (10 * 20) + 1

	# Limit year values to 4 digits by removing the superscript from some values.

	download.truncate_column_int_values(1, 2, row, 4)

	row_start = 2

	# Set the rate values for each year and crime type in column 3.

	for source_col in range(4, 23, 2):

	download.copy_column(source_col, 5, 24, 3, row_start)

	row_start += 20

	# Right-justify column names for numberic columns.

	download.align_text(1, 1, 'right')
	download.align_text(3, 1, 'right')

	# Set column widths to fit data.

	download.set_column_width('A', 6)
	download.set_column_width('B', 36)
	download.set_column_width('C', 10)

	download.close_workbook()