Skip to content

Instantly share code, notes, and snippets.

@tomers
Created January 5, 2023 16:10
Show Gist options
  • Save tomers/32e49c06321cb6673940b886ddf3dec7 to your computer and use it in GitHub Desktop.
Save tomers/32e49c06321cb6673940b886ddf3dec7 to your computer and use it in GitHub Desktop.
Haifa Vadis XLSX to CSV file converter
#!/usr/bin/env python3
import csv
import re
from itertools import islice
from pathlib import Path
import openpyxl
import phonenumbers
import validators
def convert_xlsx(xlsx: Path, prefix, csv_all):
print(f"Converting {xlsx}")
# read Excel file
workbook = openpyxl.load_workbook(filename=xlsx)
sheet = workbook.active
# delete unwanted columns
sheet.delete_cols(idx=4, amount=2)
# rename columns
sheet['A1'] = 'First Name'
sheet['B1'] = 'Mobile Phone'
sheet['C1'] = 'E-mail Address'
# add prefix to name
for row in range(2, sheet.max_row + 1):
cell = f'A{row}'
name = sheet[cell].value
sheet[cell] = f'{str(prefix) * 3} {name}'
# normalize phone numbers
for row in range(2, sheet.max_row + 1):
cell = f'B{row}'
value = str(sheet[cell].value)
phone = None
try:
phone = phonenumbers.parse(value, 'IL')
phone = phonenumbers.format_number(
phone, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
except phonenumbers.phonenumberutil.NumberParseException as e:
print(f"Failed parsing cell {cell} with value {value}: {e}")
sheet[cell] = str(phone)
# normalize emails
for row in range(2, sheet.max_row + 1):
cell = f'C{row}'
value = str(sheet[cell].value)
sheet[cell] = value if validators.email(value) else None
# normalize cells
for column in range(1, sheet.max_column + 1):
for row in range(2, sheet.max_row + 1):
cell = sheet.cell(row=row, column=column)
cell.value = re.sub(r'\s+', ' ', str(cell.value).strip())
# store CSV file
csv_output = Path(str(xlsx) + '.csv')
with open(csv_output, 'w', newline='') as f:
c = csv.writer(f)
for r in islice(sheet.rows, 1, None):
c.writerow([cell.value for cell in r])
# append to global CSV file
c = csv.writer(csv_all)
for r in islice(sheet.rows, 1, None):
c.writerow([cell.value for cell in r])
def main():
input_files = list(Path.cwd().glob('*.xlsx'))
with open('all.csv', 'w', newline='') as f:
for i, xlsx in enumerate(input_files):
convert_xlsx(xlsx, i, f)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment