Skip to content

Instantly share code, notes, and snippets.

@hassanvfx
Created July 10, 2023 14:15
Show Gist options
  • Save hassanvfx/b445fbdcabe29957670aff4cd34cc9dd to your computer and use it in GitHub Desktop.
Save hassanvfx/b445fbdcabe29957670aff4cd34cc9dd to your computer and use it in GitHub Desktop.
This script extracts only the text content from the Twitter scraper's CSV output.
import csv
import sys
import os
import re
def remove_urls(text):
    """Return ``text`` with every ``http://`` / ``https://`` URL deleted.

    A URL is taken to be ``http`` or ``https`` followed by ``://`` and one
    or more characters from the classes listed in the pattern (letters,
    digits, a set of punctuation characters, or ``%XX`` hex escapes).
    Each match is replaced by the empty string; surrounding whitespace is
    left untouched.
    """
    return re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        text,
    )
def read_csv(filename, output_filename):
    """Write the ``Content`` column of a CSV file to a text file.

    Each data row produces one output line: the row's ``Content`` value
    with URLs removed by ``remove_urls``, followed by a newline. Rows whose
    ``Content`` is missing produce an empty line.

    Args:
        filename: Path of the input CSV. Its header row must contain a
            ``Content`` column.
        output_filename: Path of the text file to create or overwrite.

    Raises:
        KeyError: If a data row is read and the header has no ``Content``
            column.
    """
    # newline='' hands line-ending handling to the csv module (as its
    # documentation requires) so quoted fields containing line breaks are
    # parsed correctly. 'utf-8-sig' decodes UTF-8 and drops a leading BOM,
    # which would otherwise become part of the first header name.
    with open(filename, 'r', newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.DictReader(csvfile)
        # Explicit UTF-8 so non-ASCII text (emoji, accents) is written the
        # same way regardless of the platform's default encoding.
        with open(output_filename, 'w', encoding='utf-8') as txtfile:
            for row in reader:
                content = row['Content']
                # DictReader fills fields missing from a short row with None.
                content = '' if content is None else remove_urls(content)
                txtfile.write(content + '\n')
def main():
    """Command-line entry point.

    Expects exactly one argument, the input CSV path, and writes the
    extracted text next to it using the same name with a ``.txt``
    extension.

    Exits with status 1 (message on stderr) when the argument count is
    wrong, or when the derived output path is the input file itself.
    """
    if len(sys.argv) != 2:
        print(f'Usage: {sys.argv[0]} input_filename', file=sys.stderr)
        sys.exit(1)

    input_filename = sys.argv[1]
    output_filename = os.path.splitext(input_filename)[0] + '.txt'

    # If the input already ends in '.txt' (or differs only by case on a
    # case-insensitive filesystem) the output path is the input file, and
    # opening it for writing would truncate the data before it is read.
    if (os.path.exists(input_filename)
            and os.path.exists(output_filename)
            and os.path.samefile(input_filename, output_filename)):
        print(
            f'Error: output file {output_filename} would overwrite the input file',
            file=sys.stderr,
        )
        sys.exit(1)

    read_csv(input_filename, output_filename)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
@hassanvfx
Copy link
Author

Run the script like this:

python output_text.py twitter_account.csv

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment