# y2k-shubham/hash_script.py

## hash_script.py
import hashlib
import os
import sys


# /////////////////////////
# file-read / write methods
# /////////////////////////

def get_utf8_file_descriptor(file_path, mode):
    """
    opens a file with UTF-8 encoding, working on both Python 2 and 3
    (Python 3 uses the built-in open(), older versions use io.open())
    ..
    :param file_path: complete qualified file path + name
    :type file_path: str
    :param mode: file opening mode (like "r" for read, "w" for write);
                 it is passed through unchanged to the open call
    :type mode: str
    :return: an open file object (usable in a "with" statement)
    """
    running_on_python2 = sys.version_info[0] < 3
    if running_on_python2:
        # imported lazily: only needed on the legacy interpreter
        import io
        opener = io.open
    else:
        opener = open
    return opener(file_path, mode=mode, encoding="utf-8")


def read_lines(file_path):
    """
    loads a whole UTF-8 text file into memory and breaks it into lines
    (line terminators are not included in the returned strings)
    ..
    :param file_path: complete qualified file path + name
    :type file_path: str
    :return: list of strings, one item per line of the file
    :type: List[str]
    """
    input_file = get_utf8_file_descriptor(file_path=file_path, mode="r")
    with input_file:
        contents = input_file.read()
    return contents.splitlines()


def write_lines(file_path, lines):
    """
    writes the given strings to a UTF-8 text file, separated by
    newline characters (no newline is written after the last item)
    ..
    :param file_path: complete qualified output file-path + name
    :type file_path: str
    :param lines: list of strings to be written to file
    :type lines: List[str]
    :return: None
    """
    with get_utf8_file_descriptor(file_path=file_path, mode="wt") as output_file:
        file_contents = "\n".join(lines)
        output_file.write(file_contents)


# /////////////////////////
# string manipulation / hashing methods
# /////////////////////////

def is_enclosed_by(word, punctuation="\""):
    """
    Determines whether or not the given word both starts and ends
    with the given punctuation string
    ..
    An empty word, an empty punctuation string, or a word too short to
    hold separate opening and closing marks (e.g. a lone '"') is
    reported as not enclosed instead of raising IndexError
    ..
    :param word: string to be checked for enclosure
    :type word: str
    :param punctuation: punctuation mark (normally a single character)
    :type punctuation: str
    :return: Boolean denoting whether or not string is enclosed
    :type: bool
    """
    if not word or not punctuation:
        return False
    # opening and closing marks must not be the very same characters
    if len(word) < 2 * len(punctuation):
        return False
    return word.startswith(punctuation) and word.endswith(punctuation)


def hash_word(word):
    """
    Returns the SHA-1 (160-bit) hash digest of a string
    ..
    The string is explicitly encoded as UTF-8 before hashing, so that
    non-ASCII text hashes identically on Python 2 and Python 3 (the
    argument-less encode() falls back to ASCII on Python 2)
    ..
    :param word: string to be hashed
    :type word: str
    :return: hexadecimal SHA-1 digest (40 characters) of the passed string
    :type: str
    """
    return hashlib.sha1(word.encode("utf-8")).hexdigest()


def hash_lines(lines, column_positions_to_hash, contains_column_headers):
    """
    Accepts a list of string (lines) and a list of positions (ints)
    - Assumes each item of lines is a row of CSV
    - Splits each line of lines by comma ',' (a plain split: quoted
      fields that themselves contain commas are NOT supported)
    - Hashes (SHA-1) those tokens (words) of line whose position is
      specified by column_positions_to_hash; a token enclosed in double
      quotes stays enclosed in double quotes after hashing
    - rebuilds and returns the list of strings with new (hashed) data
    ..
    Rows are handled defensively so that imperfect files do not crash:
    - the header row (if any) is copied through without being hashed
    - blank lines are copied through unchanged
    - positions beyond the end of a (short) row are skipped for that row
    - an empty list of lines yields an empty list
    ..
    :param lines: list of strings (lines) of a CSV file
    :type: lines: List[str]
    :param column_positions_to_hash: list of column positions (1-indexed) to be hashed
                                     in each item (line) of lines
    :type column_positions_to_hash: List[int]
    :param contains_column_headers: whether or not input list of lines includes
                                    (first line) as column headers
    :type contains_column_headers: bool
    :return: list of strings formed by hashing words at specified positions
    :type: List[str]
    :raises ValueError: if any column position is smaller than 1 (such a
                        position would silently address columns from the end)
    """
    invalid_positions = [position for position in column_positions_to_hash if position < 1]
    if invalid_positions:
        raise ValueError(
            "column positions are 1-indexed, got: {}".format(invalid_positions))
    column_indexes = [position - 1 for position in column_positions_to_hash]

    hashed_lines = []
    for line_number, line in enumerate(lines):
        is_header_row = contains_column_headers and line_number == 0
        if is_header_row or not line:
            hashed_lines.append(line)
            continue
        line_tokens = line.split(",")
        # tokens are always read from the untouched split, so a position
        # listed twice is not hashed twice
        hashed_line_tokens = line_tokens[:]
        for column_index in column_indexes:
            if column_index >= len(line_tokens):
                continue
            token = line_tokens[column_index]
            # the emptiness guard keeps empty fields away from is_enclosed_by
            token_is_punctuated = bool(token) and is_enclosed_by(token)
            hashed_token = hash_word(token.strip("\""))
            if token_is_punctuated:
                hashed_token = "\"{}\"".format(hashed_token)
            hashed_line_tokens[column_index] = hashed_token
        hashed_lines.append(",".join(hashed_line_tokens))
    return hashed_lines


# /////////////////////////
# main method
# /////////////////////////

def main(base_dir="/Users/compadmin/Downloads",
         input_file_name="sample.csv",
         column_positions_to_hash=(3, 4),
         contains_column_headers=True):
    """
    Reads a CSV file, hashes the requested columns of every row and writes
    the result next to the input file as "<input_file_name>.sha1"
    ..
    All parameters default to the values that used to be hard-coded, so
    calling main() without arguments behaves exactly as before
    ..
    :param base_dir: directory containing input / output files
                     (fully qualified absolute path)
    :type base_dir: str
    :param input_file_name: input file name (with extension)
    :type input_file_name: str
    :param column_positions_to_hash: (1-indexed) column positions to be hashed in each row
    :type column_positions_to_hash: Sequence[int]
    :param contains_column_headers: whether or not input file contains column headers
    :type contains_column_headers: bool
    :return: None
    """
    # output file name = "input_file_name.csv.sha1"
    output_file_name = input_file_name + ".sha1"

    lines = read_lines(file_path=os.path.join(base_dir, input_file_name))
    hashed_lines = hash_lines(lines=lines,
                              column_positions_to_hash=column_positions_to_hash,
                              contains_column_headers=contains_column_headers)
    write_lines(file_path=os.path.join(base_dir, output_file_name), lines=hashed_lines)


# run the hashing job only when this file is executed as a script
# (nothing happens when it is merely imported as a module)
if __name__ == "__main__":
    main()
	import hashlib
	import os
	import sys


	# /////////////////////////
	# file-read / write methods
	# /////////////////////////

	def get_utf8_file_descriptor(file_path, mode):
		"""
		depending on Python version (2 / 3), returns a file
		descriptor of passed mode (read / write) with UTF-8
		(non-ASCII characters) encoding support
		..
		:param file_path: complete qualified file path + name
		:type file_path: str
		:param mode: file opening mode (like "r" for read); it is passed
		             through unchanged to the open call
		:type mode: str
		:return: A file descriptor for operations (read / write / append etc)
		"""
		if sys.version_info[0] >= 3:
			return open(file_path, mode=mode, encoding="utf-8")
		else:
			# imported lazily: only needed on the legacy interpreter
			import io
			return io.open(file_path, mode=mode, encoding="utf-8")


	def read_lines(file_path):
		"""
		reads a UTF-8 text file and returns a list of strings
		(each line represented by an element of list, without
		its line terminator)
		..
		:param file_path: complete qualified file path + name
		:type file_path: str
		:return: list of strings, where each item is a line of file
		:type: List[str]
		"""
		with get_utf8_file_descriptor(file_path=file_path, mode="r") as input_file:
			lines = input_file.read().splitlines()
			return lines


	def write_lines(file_path, lines):
		"""
		writes a list of strings item-by-item to different lines of
		a UTF-8 compatible text file (items are joined by newline
		characters; no newline is written after the last item)
		..
		:param file_path: complete qualified output file-path + name
		:type file_path: str
		:param lines: list of strings to be written to file
		:type lines: List[str]
		:return: None
		"""
		with get_utf8_file_descriptor(file_path=file_path, mode="wt") as output_file:
			output_file.write("\n".join(lines))


	# /////////////////////////
	# string manipulation / hashing methods
	# /////////////////////////

	def is_enclosed_by(word, punctuation="\""):
		"""
		Determines whether or not the given word both starts and ends
		with the given punctuation string
		..
		An empty word, an empty punctuation string, or a word too short to
		hold separate opening and closing marks (e.g. a lone '"') is
		reported as not enclosed instead of raising IndexError
		..
		:param word: string to be checked for enclosure
		:type word: str
		:param punctuation: punctuation mark (normally a single character)
		:type punctuation: str
		:return: Boolean denoting whether or not string is enclosed
		:type: bool
		"""
		if not word or not punctuation:
			return False
		# opening and closing marks must not be the very same characters
		if len(word) < 2 * len(punctuation):
			return False
		return word.startswith(punctuation) and word.endswith(punctuation)


	def hash_word(word):
		"""
		Returns the SHA-1 (160-bit) hash digest of a string
		..
		The string is explicitly encoded as UTF-8 before hashing, so that
		non-ASCII text hashes identically on Python 2 and Python 3 (the
		argument-less encode() falls back to ASCII on Python 2)
		..
		:param word: string to be hashed
		:type word: str
		:return: hexadecimal SHA-1 digest (40 characters) of the passed string
		:type: str
		"""
		return hashlib.sha1(word.encode("utf-8")).hexdigest()


	def hash_lines(lines, column_positions_to_hash, contains_column_headers):
		"""
		Accepts a list of string (lines) and a list of positions (ints)
		- Assumes each item of lines is a row of CSV
		- Splits each line of lines by comma ',' (a plain split: quoted
		  fields that themselves contain commas are NOT supported)
		- Hashes (SHA-1) those tokens (words) of line whose position is
		  specified by column_positions_to_hash; a token enclosed in double
		  quotes stays enclosed in double quotes after hashing
		- rebuilds and returns the list of strings with new (hashed) data
		..
		Rows are handled defensively so that imperfect files do not crash:
		- the header row (if any) is copied through without being hashed
		- blank lines are copied through unchanged
		- positions beyond the end of a (short) row are skipped for that row
		- an empty list of lines yields an empty list
		..
		:param lines: list of strings (lines) of a CSV file
		:type: lines: List[str]
		:param column_positions_to_hash: list of column positions (1-indexed) to be hashed
		                                 in each item (line) of lines
		:type column_positions_to_hash: List[int]
		:param contains_column_headers: whether or not input list of lines includes
		                                (first line) as column headers
		:type contains_column_headers: bool
		:return: list of strings formed by hashing words at specified positions
		:type: List[str]
		:raises ValueError: if any column position is smaller than 1 (such a
		                    position would silently address columns from the end)
		"""
		invalid_positions = [position for position in column_positions_to_hash if position < 1]
		if invalid_positions:
			raise ValueError(
				"column positions are 1-indexed, got: {}".format(invalid_positions))
		column_indexes = [position - 1 for position in column_positions_to_hash]

		hashed_lines = []
		for line_number, line in enumerate(lines):
			is_header_row = contains_column_headers and line_number == 0
			if is_header_row or not line:
				hashed_lines.append(line)
				continue
			line_tokens = line.split(",")
			# tokens are always read from the untouched split, so a position
			# listed twice is not hashed twice
			hashed_line_tokens = line_tokens[:]
			for column_index in column_indexes:
				if column_index >= len(line_tokens):
					continue
				token = line_tokens[column_index]
				# the emptiness guard keeps empty fields away from is_enclosed_by
				token_is_punctuated = bool(token) and is_enclosed_by(token)
				hashed_token = hash_word(token.strip("\""))
				if token_is_punctuated:
					hashed_token = "\"{}\"".format(hashed_token)
				hashed_line_tokens[column_index] = hashed_token
			hashed_lines.append(",".join(hashed_line_tokens))
		return hashed_lines


	# /////////////////////////
	# main method
	# /////////////////////////

	def main(base_dir="/Users/compadmin/Downloads",
			input_file_name="sample.csv",
			column_positions_to_hash=(3, 4),
			contains_column_headers=True):
		"""
		Reads a CSV file, hashes the requested columns of every row and writes
		the result next to the input file as "<input_file_name>.sha1"
		..
		All parameters default to the values that used to be hard-coded, so
		calling main() without arguments behaves exactly as before
		..
		:param base_dir: directory containing input / output files
		                 (fully qualified absolute path)
		:type base_dir: str
		:param input_file_name: input file name (with extension)
		:type input_file_name: str
		:param column_positions_to_hash: (1-indexed) column positions to be hashed in each row
		:type column_positions_to_hash: Sequence[int]
		:param contains_column_headers: whether or not input file contains column headers
		:type contains_column_headers: bool
		:return: None
		"""
		# output file name = "input_file_name.csv.sha1"
		output_file_name = input_file_name + ".sha1"

		lines = read_lines(file_path=os.path.join(base_dir, input_file_name))
		hashed_lines = hash_lines(lines=lines,
				column_positions_to_hash=column_positions_to_hash,
				contains_column_headers=contains_column_headers)
		write_lines(file_path=os.path.join(base_dir, output_file_name), lines=hashed_lines)


	# run the hashing job only when this file is executed as a script
	# (nothing happens when it is merely imported as a module)
	if __name__ == "__main__":
		main()