Skip to content

Instantly share code, notes, and snippets.

@y2k-shubham
Created February 27, 2020 05:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save y2k-shubham/3b4d3d5c0163866e038128f6a34d32ac to your computer and use it in GitHub Desktop.
Save y2k-shubham/3b4d3d5c0163866e038128f6a34d32ac to your computer and use it in GitHub Desktop.
import hashlib
import os
import sys
# /////////////////////////
# file-read / write methods
# /////////////////////////
def get_utf8_file_descriptor(file_path, mode):
    """
    Open a file with UTF-8 encoding on either Python 2 or Python 3.

    On Python 3 the builtin ``open`` is used directly; on Python 2 the
    call is routed through ``io.open``, which accepts the ``encoding``
    keyword there.
    ..
    :param file_path: complete qualified file path + name
    :type file_path: str
    :param mode: file opening mode (e.g. "r" to read, "wt" to write text)
    :type mode: str
    :return: the opened file object (read / write / append etc)
    """
    if sys.version_info[0] < 3:
        # imported lazily: only the Python 2 branch needs it
        import io
        opener = io.open
    else:
        opener = open
    return opener(file_path, mode=mode, encoding="utf-8")
def read_lines(file_path):
    """
    Read a UTF-8 text file and return its lines without line terminators.

    The file is iterated line by line, so only the newline conventions
    recognised by the text layer (LF, CRLF, CR) end a line.
    ``str.splitlines`` is deliberately not used: it also splits on
    characters such as U+2028, U+0085 and form feed, which would break a
    CSV row in two whenever a field contains one of them.
    ..
    :param file_path: complete qualified file path + name
    :type file_path: str
    :return: list of strings, where each item is a line of file
    :type: List[str]
    """
    with get_utf8_file_descriptor(file_path=file_path, mode="r") as input_file:
        # every line except possibly the last one ends with a single "\n"
        return [
            line[:-1] if line.endswith("\n") else line
            for line in input_file
        ]
def write_lines(file_path, lines):
    """
    Write the given strings to a UTF-8 text file, one per line.

    The items are joined with a newline between them; no newline is
    appended after the last item.
    ..
    :param file_path: complete qualified output file-path + name
    :type file_path: str
    :param lines: list of strings to be written to file
    :type lines: List[str]
    :return: None
    """
    with get_utf8_file_descriptor(file_path=file_path, mode="wt") as sink:
        joined_text = "\n".join(lines)
        sink.write(joined_text)
# /////////////////////////
# string manipulation / hashing methods
# /////////////////////////
def is_enclosed_by(word, punctuation="\""):
    """
    Determine whether the given word is enclosed by the given
    punctuation, i.e. starts AND ends with it.

    An empty word, or a word too short to hold both an opening and a
    closing punctuation (for example a lone quote character), is not
    considered enclosed.
    ..
    :param word: string to be checked for enclosure
    :type word: str
    :param punctuation: punctuation character
    :type punctuation: str (single-character-string)
    :return: Boolean denoting whether or not string is enclosed
    :type: bool
    """
    if not punctuation:
        # nothing to be enclosed by
        return False
    if len(word) < 2 * len(punctuation):
        # covers the empty string (which used to raise IndexError) and a
        # lone punctuation character (opening and closing would overlap)
        return False
    return word.startswith(punctuation) and word.endswith(punctuation)
def hash_word(word):
    """
    Return the SHA-1 (160-bit) hash digest of a string.

    The string is encoded as UTF-8 explicitly before hashing, so that
    non-ASCII text hashes identically on Python 2 and Python 3 (the
    implicit default encoding on Python 2 is ASCII and would raise).
    ..
    :param word: string to be hashed
    :type word: str
    :return: hexadecimal SHA-1 digest (40 characters) of the passed string
    :type: str
    """
    return hashlib.sha1(word.encode("utf-8")).hexdigest()
def _split_csv_row(line):
    """
    Split one CSV row on the commas that are not inside double quotes.

    Tokens are returned raw (surrounding quotes included), so joining
    them with "," reproduces the input line exactly. If the row has an
    unbalanced double quote, it falls back to a plain split on ",".
    """
    tokens = []
    token_start = 0
    inside_quotes = False
    for index, char in enumerate(line):
        if char == "\"":
            # an escaped quote ("") toggles twice and so cancels out
            inside_quotes = not inside_quotes
        elif char == "," and not inside_quotes:
            tokens.append(line[token_start:index])
            token_start = index + 1
    if inside_quotes:
        # malformed row: quoting cannot be trusted, split naively
        return line.split(",")
    tokens.append(line[token_start:])
    return tokens


def hash_lines(lines, column_positions_to_hash, contains_column_headers):
    """
    Accepts a list of string (lines) and a list of positions (ints)
    - Assumes each item of lines is a row of CSV
    - Splits each line on the commas that are not inside double quotes
    - Hashes (SHA-1) those tokens (words) of line whose position is
      specified by column_positions_to_hash; a token enclosed in double
      quotes is hashed without them and the digest is re-quoted
    - rebuilds and returns the list of strings with new (hashed) data

    The header row (when present) and blank lines are passed through
    unchanged. A non-blank row that has fewer columns than a requested
    position raises IndexError.
    ..
    :param lines: list of strings (lines) of a CSV file
    :type: lines: List[str]
    :param column_positions_to_hash: list of column positions (1-indexed) to be hashed
                                     in each item (line) of lines
    :type column_positions_to_hash: List[int]
    :param contains_column_headers: whether or not input list of lines includes
                                    (first line) as column headers
    :type contains_column_headers: bool
    :return: list of strings formed by hashing words at specified positions
    :type: List[str]
    """
    hashed_lines = []
    for row_number, line in enumerate(lines):
        is_header_row = contains_column_headers and row_number == 0
        if is_header_row or not line.strip():
            # headers are kept readable; blank lines have nothing to hash
            hashed_lines.append(line)
            continue

        line_tokens = _split_csv_row(line)
        # hash from the untouched tokens into a copy, so a position that
        # is listed twice is not hashed twice
        hashed_line_tokens = list(line_tokens)
        for column_position in column_positions_to_hash:
            token_index = column_position - 1
            token = line_tokens[token_index]
            # an empty field is never quoted; checking it here keeps the
            # enclosure test from ever seeing an empty string
            token_is_quoted = bool(token) and is_enclosed_by(token)
            hashed_token = hash_word(token.strip("\""))
            if token_is_quoted:
                hashed_token = "\"{}\"".format(hashed_token)
            hashed_line_tokens[token_index] = hashed_token
        hashed_lines.append(",".join(hashed_line_tokens))
    return hashed_lines
# /////////////////////////
# main method
# /////////////////////////
def main(base_dir="/Users/compadmin/Downloads",
         input_file_name="sample.csv",
         column_positions_to_hash=(3, 4),
         contains_column_headers=True):
    """
    Hash selected columns of a CSV file and write the result next to it.

    Reads ``input_file_name`` from ``base_dir``, hashes the requested
    columns of every row and writes the rows to a file with the same
    name plus a ".sha1" suffix in the same directory. The defaults are
    the values that were previously hard-coded in this function.
    ..
    :param base_dir: directory containing input / output files
                     (fully qualified absolute path)
    :type base_dir: str
    :param input_file_name: input file name (with extension)
    :type input_file_name: str
    :param column_positions_to_hash: (1-indexed) column positions to be
                                     hashed in each row
    :type column_positions_to_hash: Sequence[int]
    :param contains_column_headers: whether or not input file contains
                                    column headers
    :type contains_column_headers: bool
    :return: None
    """
    # output file name = "<input_file_name>.sha1"
    output_file_name = input_file_name + ".sha1"

    lines = read_lines(file_path=os.path.join(base_dir, input_file_name))
    hashed_lines = hash_lines(lines=lines,
                              column_positions_to_hash=column_positions_to_hash,
                              contains_column_headers=contains_column_headers)
    write_lines(file_path=os.path.join(base_dir, output_file_name), lines=hashed_lines)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment