# Extract text from simpsons_script_lines.csv
import os
import re

# Path of the CSV file to read.  NOTE: despite its name, ``data_dir`` holds a
# file path, not a directory; the name is kept so any later code that refers
# to it keeps working.
data_dir = './data/simpsons_script_lines.csv'
# os.path.join() with a single argument returns that argument unchanged, so
# ``input_file`` is the same path as ``data_dir``.
input_file = os.path.join(data_dir)

# Matches three comma-separated (possibly empty) digit runs, then lazily
# captures everything up to the next ",<digits>," sequence.
# NOTE(review): presumably the capture is the "Speaker: spoken line" column of
# the CSV -- confirm against the file's header row.
_LINE_PATTERN = re.compile(r'[0-9]*,[0-9]*,[0-9]*,(.+?),[0-9]*,')

# Collect the cleaned entries in a list and join once at the end instead of
# growing a string with ``+=`` inside the loop.
cleaned_lines = []
with open(input_file, "r", encoding="utf8") as f:
    for line in f:
        match = _LINE_PATTERN.search(line)
        if not match:
            # Lines that do not fit the expected layout are skipped.
            continue
        # Drop every double-quote character from the captured text.
        text = match.group(1).replace('"', '')
        # Only the part before the first ':' gets its spaces replaced by
        # underscores (presumably the speaker name, so that it stays a single
        # token); everything after the first ':' is left untouched.
        head, colon, rest = text.partition(':')
        cleaned_lines.append(head.replace(' ', '_') + colon + rest)

# One cleaned entry per line, each terminated by a newline.
clean_text = ''.join(entry + '\n' for entry in cleaned_lines)

# Preview: the first ten newline-separated pieces of the cleaned text.
print('\n'.join(clean_text.split('\n')[:10]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment