Skip to content

Instantly share code, notes, and snippets.

@shrubb
Created May 28, 2017 16:08
Show Gist options
  • Save shrubb/f2a973c5ea8e8101308a76a699464a5a to your computer and use it in GitHub Desktop.
Save shrubb/f2a973c5ea8e8101308a76a699464a5a to your computer and use it in GitHub Desktop.
Butyrka csv-to-txt preprocessing
# encoding: utf-8
import sys
import csv
import re
def preprocess_text(text):
    """Normalize the raw lyrics of one song into a uniform plain-text form.

    The steps are applied in this order:

    1. Drop tab characters, fold "ё"/"Ё" into "е"/"Е", expand the ellipsis
       character into three dots, and map typographic dashes and quotes to
       plain ``-`` and ``"``.
    2. Blank out every line mentioning "Припев"/"припев" (chorus) or
       "Повтор"/"повтор" (repeat), and cut everything from the first ``/``
       to the end of its line.
    3. Normalize whitespace: no spaces at line edges next to a line break,
       single spaces only, at most one empty line in a row.
    4. Make sure a comma is followed by a space and that a hyphen touching
       whitespace on one side (i.e. used as a dash) has whitespace on both.
    5. Upper-case the first character of every line when it is a lowercase
       letter.

    Args:
        text: Lyrics as a single string whose lines are separated by the
            newline character.

    Returns:
        The normalized lyrics as a string.
    """
    text = text.replace('\t', '')
    # Fold both cases of "ё" so that a line start ends up as "Е" regardless
    # of how the source spelled it.
    text = text.replace('ё', 'е').replace('Ё', 'Е')
    text = text.replace('…', '...')
    for c in '–—–':
        text = text.replace(c, '-')
    for c in '‘“”`«»':
        text = text.replace(c, '"')

    # Remove chorus / repeat marker lines (the line break itself is kept and
    # collapsed further below).
    text = re.sub(r'.*[Пп]рипев.*', '', text)
    text = re.sub(r'.*[Пп]овтор.*', '', text)
    text = re.sub(r'/.*', '', text)          # junk such as " // 2 раза"

    text = re.sub(r'\n +', '\n', text)       # strip spaces at the start of a line
    text = re.sub(r' +\n', '\n', text)       # strip spaces at the end of a line
    text = re.sub(r' +', ' ', text)          # never more than one space in a row
    text = re.sub(r'\n\n+', '\n\n', text)    # never more than two line breaks in a row

    # Always a space after a comma. The lookahead keeps the following
    # character in place instead of replacing it.
    text = re.sub(r',(?=[^ \n])', ', ', text)

    # A hyphen used as a dash always sits between whitespace. Lookarounds
    # leave the neighbouring characters (including line breaks) untouched;
    # hyphens inside words ("кто-то") match neither pattern.
    text = re.sub(r'(?<=[ \n])-(?=[^ \n])', '- ', text)
    text = re.sub(r'(?<=[^ \n])-(?=[ \n])', ' -', text)

    # Capitalize the first letter of every line, the last line included.
    lines = text.split('\n')
    for i, line in enumerate(lines):
        first = line[:1]
        if first.isalpha() and first.islower():
            lines[i] = first.upper() + line[1:]
    return '\n'.join(lines)
# The script expects exactly two command-line arguments: the input CSV path
# and the output text path.
if len(sys.argv) != 3:
    # sys.exit() with a string writes it to stderr and exits with status 1,
    # so a wrong invocation is reported as a failure. The bare exit() helper
    # is injected by the site module and signalled success here.
    sys.exit('Usage: python3 csv_to_txt.py chanson.csv chanson.txt')

csv_file_path = sys.argv[1]
txt_file_path = sys.argv[2]
import random
# Formatted text of every song, in the (shuffled) order it was written out.
songs = []

# Read all rows up front. The file is opened in the default universal-newline
# mode on purpose: line breaks inside the lyrics reach preprocess_text() as
# plain '\n', which is what its patterns look for.
# NOTE(review): the CSV is assumed to be UTF-8 encoded - confirm.
with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, quoting=csv.QUOTE_ALL)
    next(csv_reader, None)  # skip header; tolerate a completely empty file
    csv_lines = list(csv_reader)

# Write the songs in random order.
random.shuffle(csv_lines)

with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
    for row in csv_lines:
        # Each row has five columns; the first one is not used.
        _, author, title, text, labels = row

        parts = [
            '======\n',
            'Автор: {}\n'.format(author),
            'Название: {}\n'.format(title),
        ]
        if labels:
            # Labels are comma-separated; a '/' inside a label is shown as ', '.
            parts.extend(
                '* {}\n'.format(label.replace('/', ', '))
                for label in labels.split(',')
            )
        parts.append('======\n\n')
        parts.append(preprocess_text(text))
        parts.append('\n\n')

        song = ''.join(parts)
        txt_file.write(song)
        songs.append(song)

# Optional export of the collected songs as a NumPy array (requires numpy):
# import numpy as np
# np.save('chanson.npy', np.array(songs))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment