Skip to content

Instantly share code, notes, and snippets.

@egordorichev
Last active January 30, 2021 10:05
Show Gist options
  • Save egordorichev/f847803c9995ab58fc5ddae17846e9c0 to your computer and use it in GitHub Desktop.
Save egordorichev/f847803c9995ab58fc5ddae17846e9c0 to your computer and use it in GitHub Desktop.
Useful information from twitter
# Requires Python 3.6; also needs the pattern, gingerit and better_profanity packages imported below.
from pattern.en import parse
from pattern.web import Twitter
from gingerit.gingerit import GingerIt
from better_profanity import profanity
import json
import string
import re
import time
# GingerIt client shared by every parseText() call; parseText() takes the
# 'result' entry of parser.parse(...) as the rewritten sentence.
# NOTE(review): presumably Ginger's grammar/spelling correction — confirm against gingerit docs.
parser = GingerIt()
def findUrls(string):
    """Return the URL-like substrings of ``string`` in order of appearance.

    A candidate starts (case-insensitively) with ``http://`` / ``https://``,
    a ``www``-style prefix, or a bare ``domain.tld/``.  It then extends over
    non-space characters and parenthesised runs (up to two levels deep) and
    is not allowed to end on trailing punctuation or a closing quote.

    Returns an empty list when nothing matches.
    """
    # NOTE: the parameter shadows the ``string`` module inside this function;
    # the name is kept because it is part of the call signature.
    pattern = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    )
    # Group 1 wraps the whole URL; the inner groups only exist to handle
    # parentheses and are not part of the result.
    return [match.group(1) for match in pattern.finditer(string)]
# Twitter search client from pattern.web, polled by the main loop below.
# NOTE(review): throttle=10 is presumably a delay between requests — confirm against pattern.web docs.
twitter = Twitter(throttle=10)
# Sentences accepted by parseText(), in arrival order; dumped to twitter2.json.
data = []
# Value passed as `start` to twitter.search(); incremented after each completed pass of the main loop.
index = 0
# Not referenced anywhere else in the visible source.
chunkId = 0
# len(data) at which parseText() next rewrites twitter2.json; parseText() then raises it by 16.
nextTarget = 1
def stringIsAscii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty ``s`` is considered ASCII.
    """
    for ch in s:
        if ord(ch) > 127:
            # First non-ASCII character decides the answer.
            return False
    return True
def parseText(text):
    """Turn the sentences of ``text`` into short statements and record them.

    ``text`` is tagged with ``pattern.en.parse`` (relations and lemmata on).
    A sentence is kept only when it has at most 32 tokens, its first
    ``VP-1`` token carries one of the tags VBD / VBN / VBP, and the rebuilt
    string is pure ASCII.  The rebuilt string drops tokens whose relation
    contains ``SBJ-1``, uses the lemma for the remaining ``-1`` tokens and
    the surface form for everything else.  It is then passed through the
    module-level ``parser`` and ``profanity.censor`` and appended to the
    module-level ``data`` list.

    Side effects: prints progress to stdout, mutates ``data`` and
    ``nextTarget``, and rewrites ``twitter2.json`` whenever ``len(data)``
    reaches ``nextTarget``.
    """
    global nextTarget

    # NOTE(review): token fields are indexed positionally; with these parse()
    # options they are presumably word, POS tag, chunk, PNP, relation, lemma —
    # confirm against the pattern.en documentation.
    sentences = parse(text, relations=True, lemmata=True).split()
    acceptedVerbTags = ('VBD', 'VBN', 'VBP')

    for sentence in sentences:
        if len(sentence) > 32:
            continue

        # Skip sentences whose final token has 'O' in field 2 unless that
        # token is a full stop.
        lastToken = sentence[-1]
        if lastToken[2] == 'O' and lastToken[0] != '.':
            continue

        pieces = []
        sawVerb = False
        rejected = False
        for word in sentence:
            relation = word[4]
            if not sawVerb and relation == 'VP-1':
                sawVerb = True
                # Only the first VP-1 token decides whether the sentence is usable.
                if word[1] not in acceptedVerbTags:
                    rejected = True
                    break
            lemma = word[5]
            if 'SBJ-1' in relation:
                # The subject of the first clause is dropped entirely.
                continue
            pieces.append(lemma if '-1' in relation else word[0])

        if rejected or not sawVerb:
            continue

        # Glue tokens back together: punctuation attaches to the previous
        # token, everything else is preceded by a space.
        spaced = []
        for piece in pieces:
            prefix = '' if piece in string.punctuation else ' '
            spaced.append(prefix + piece)
        sentenceText = ''.join(spaced).strip().capitalize()

        if not stringIsAscii(sentenceText):
            continue
        if sentenceText[-1] not in string.punctuation:
            sentenceText += '.'

        print(text)
        print('->', sentenceText)
        corrected = profanity.censor(parser.parse(sentenceText).get('result'))
        print('=>', corrected)
        data.append(corrected)

        # Snapshot the collection every 16 accepted sentences.
        if len(data) < nextTarget:
            continue
        nextTarget += 16
        with open('twitter2.json', 'w') as snapshot:
            json.dump(data, snapshot, indent=2)
# Main polling loop: repeatedly search Twitter for 'do', filter the results
# and feed the surviving tweet texts to parseText().  Runs until interrupted.
while True:
    try:
        for tweet in twitter.search('do', start=index, count=100):
            if tweet.language != 'en':
                continue
            text = tweet.text
            # Skip retweets.
            if text.startswith('RT'):
                continue
            # Skip tweets containing '&', mentions, hashtags or a backspace
            # character ('\b' is the backspace escape here, not a regex
            # word boundary).
            if any(marker in text for marker in ('&', '@', '#', '\b')):
                continue
            if findUrls(text):
                continue
            parseText(text)
    except Exception as error:
        # Deliberately best-effort: any failure while searching or parsing
        # backs off and retries the same `index`.  Catching Exception rather
        # than using a bare `except:` lets Ctrl-C (KeyboardInterrupt) and
        # SystemExit stop the script instead of being swallowed.
        print('Error:', repr(error))
        print('Sleeping...')
        time.sleep(10)
        continue
    index += 1
    time.sleep(0.1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment