# egordorichev/scaper.py

## scaper.py
# Requires python3.6

from pattern.en import parse
from pattern.web import Twitter
from gingerit.gingerit import GingerIt
from better_profanity import profanity

import json
import string
import re
import time

# GingerIt client shared by parseText(), which reads the 'result' field of
# parser.parse(...) for every candidate sentence.
parser = GingerIt()

def findUrls(string):
	"""Return every URL-looking substring of ``string``, in order of appearance.

	Matching is case-insensitive and recognises ``http://`` / ``https://``
	links, ``www.`` hosts and bare ``domain.tld/`` forms. An empty list is
	returned when nothing matches.
	"""
	# NOTE: inside this function the parameter name hides the ``string`` module.
	urlPattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

	# Group 1 wraps the whole URL; the inner groups only exist for the
	# balanced-parenthesis handling and are not part of the result.
	return [match.group(1) for match in re.finditer(urlPattern, string)]

# Search client used by the main loop (throttle=10 is presumably the delay
# between requests -- confirm against the pattern.web documentation).
twitter = Twitter(throttle=10)

# Sentences accepted so far; parseText() appends to this list and dumps it to twitter2.json.
data = []
# Passed as ``start`` to twitter.search(); incremented after each pass of the main loop that did not raise.
index = 0
# Not referenced anywhere else in the visible code.
chunkId = 0
# Length of ``data`` at which parseText() next rewrites the JSON file; it grows by 16 after every write.
nextTarget = 1

def stringIsAscii(s):
	"""Tell whether every character of ``s`` has a code point below 128.

	An empty ``s`` yields True.
	"""
	for char in s:
		if ord(char) >= 128:
			return False

	return True

def parseText(text):
	"""Rewrite the usable sentences of ``text`` and append them to ``data``.

	``text`` is parsed with ``parse(..., relations=True, lemmata=True)`` and
	handled one sentence at a time. A sentence is skipped when:

	* it has more than 32 tokens;
	* its last token has 'O' in field 2 and is not '.';
	* no token carries the relation 'VP-1', or the first such token is not
	  tagged VBD, VBN or VBP;
	* the rebuilt string contains a non-ASCII character.

	Tokens whose relation contains 'SBJ-1' are dropped, other tokens whose
	relation contains '-1' are replaced by field 5, and all remaining tokens
	keep field 0. The rebuilt sentence is capitalized, given a trailing '.'
	if it does not already end in punctuation, passed through
	``parser.parse(...).get('result')`` and ``profanity.censor``, printed and
	appended to ``data``. Each time ``len(data)`` reaches ``nextTarget`` the
	target grows by 16 and the whole list is written to ``twitter2.json``.

	Returns None.
	"""
	global nextTarget

	# NOTE(review): the indices below assume pattern's token layout
	# [word, tag, chunk, pnp, relation, lemma] -- confirm against pattern.en.
	acceptedVerbTags = ('VBD', 'VBN', 'VBP')
	parsed = parse(text, relations=True, lemmata=True)

	for sentence in parsed.split():
		if len(sentence) > 32:
			continue

		lastWord = sentence[-1]
		if lastWord[2] == 'O' and lastWord[0] != '.':
			continue

		pieces = []
		verbSeen = False
		rejected = False

		for word in sentence:
			relation = word[4]

			# Only the first 'VP-1' token decides whether the sentence is usable.
			if not verbSeen and relation == 'VP-1':
				verbSeen = True

				if word[1] not in acceptedVerbTags:
					rejected = True
					break

			lemma = word[5]

			if 'SBJ-1' in relation:
				continue

			pieces.append(lemma if '-1' in relation else word[0])

		if rejected or not verbSeen:
			continue

		# Put a space in front of every piece except punctuation.
		spaced = []
		for piece in pieces:
			if piece not in string.punctuation:
				spaced.append(' ')
			spaced.append(piece)

		candidate = ''.join(spaced).strip().capitalize()

		if not stringIsAscii(candidate):
			continue

		if candidate[-1] not in string.punctuation:
			candidate += '.'

		print(text)
		print('->', candidate)

		corrected = profanity.censor(parser.parse(candidate).get('result'))

		print('=>', corrected)
		data.append(corrected)

		if len(data) < nextTarget:
			continue

		# Checkpoint: the complete list is rewritten, not appended to.
		nextTarget += 16

		with open('twitter2.json', 'w') as outfile:
			json.dump(data, outfile, indent=2)

# Main scraping loop: keeps requesting search results until the process is
# interrupted.
while True:
	try:
		# One request for up to 100 results of the query 'do', with the
		# current value of index as the start argument.
		for tweet in twitter.search('do', start=index, count=100):
			if tweet.language != 'en': continue
			text = tweet.text

			# Skip retweets (text starting with 'RT') and texts containing
			# '&', '@', '#', a backspace character or anything findUrls() matches.
			if text.startswith('RT'): continue
			if '&' in text or '@' in text or '#' in text or '\b' in text: continue
			if findUrls(text): continue

			parseText(text)
	except Exception as error:
		# Catching Exception instead of using a bare except lets
		# KeyboardInterrupt and SystemExit stop the script. The error is
		# printed so that a failure repeating on every retry is visible.
		print('Error:', repr(error))
		print('Sleeping...')
		time.sleep(10)
		# index is left unchanged, so the same start value is requested again.
		continue

	index += 1
	time.sleep(0.1)
	# Requires python3.6

	from pattern.en import parse
	from pattern.web import Twitter
	from gingerit.gingerit import GingerIt
	from better_profanity import profanity

	import json
	import string
	import re
	import time

	# GingerIt client; the parseText() code in this file reads the 'result'
	# field of parser.parse(...) for every candidate sentence.
	parser = GingerIt()

	def findUrls(string):
		"""Return every URL-looking substring of ``string``, in order of appearance.

		Matching is case-insensitive and recognises ``http://`` / ``https://``
		links, ``www.`` hosts and bare ``domain.tld/`` forms. An empty list is
		returned when nothing matches.
		"""
		# The alternations must be plain ``|`` and the parenthesised groups
		# need their ``*`` quantifier; an escaped ``\|`` only matches a
		# literal pipe character, so no ordinary URL would ever be found.
		regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
		url = re.findall(regex, string)
		# findall yields one tuple per match; element 0 is the outer group, i.e. the whole URL.
		return [x[0] for x in url]

	# Search client used by the search loop (throttle=10 is presumably the
	# delay between requests -- confirm against the pattern.web documentation).
	twitter = Twitter(throttle=10)

	# Sentences accepted so far; parseText() appends to this list and dumps it to twitter2.json.
	data = []
	# Passed as ``start`` to twitter.search(); incremented after each pass of the search loop.
	index = 0
	# Not referenced anywhere else in the visible code.
	chunkId = 0
	# Length of ``data`` at which parseText() next rewrites the JSON file; it grows by 16 after every write.
	nextTarget = 1

	def stringIsAscii(s):
		"""Return True when every character of ``s`` has a code point below 128.

		An empty ``s`` yields True.
		"""
		return all(ord(c) < 128 for c in s)

	def parseText(text):
		"""Rewrite the usable sentences of ``text`` and append them to ``data``.

		``text`` is parsed with ``parse(..., relations=True, lemmata=True)``.
		Sentences longer than 32 tokens, sentences whose last token has 'O' in
		field 2 without being '.', sentences without a 'VP-1' token (or whose
		first 'VP-1' token is not tagged VBD/VBN/VBP) and sentences whose
		rebuilt form is not pure ASCII are skipped. Accepted sentences are
		corrected through ``parser``, censored through ``profanity``, printed
		and appended to ``data``; every time ``len(data)`` reaches
		``nextTarget`` the list is written to ``twitter2.json`` and the target
		grows by 16.

		Returns None.
		"""
		global nextTarget

		# NOTE(review): the indices used below assume pattern's token layout
		# [word, tag, chunk, pnp, relation, lemma] -- confirm against pattern.en.
		tokens = parse(
			text,
			relations=True,
			lemmata=True
		).split()

		for sentense in tokens:
			if len(sentense) > 32: continue
			if sentense[-1][2] == 'O' and sentense[-1][0] != '.': continue

			convertedSentense = []
			hadVerb = False
			bad = False

			for word in sentense:
				# Only the first 'VP-1' token decides whether the sentence is usable.
				if not hadVerb and word[4] == 'VP-1':
					hadVerb = True

					if word[1] != 'VBD' and word[1] != 'VBN' and word[1] != 'VBP':
						bad = True
						break

				token = word[5]

				# Drop 'SBJ-1' tokens; keep field 5 for other '-1' relations
				# and field 0 for everything else.
				if 'SBJ-1' in word[4]:
					continue
				elif not '-1' in word[4]:
					token = word[0]

				convertedSentense.append(token)

			if not hadVerb or bad: continue

			# Join with a space before every token except punctuation.
			sentenseString = ''.join([('' if c in string.punctuation else ' ') + c for c in convertedSentense]).strip().capitalize()

			if not stringIsAscii(sentenseString): continue
			if not sentenseString[-1] in string.punctuation: sentenseString = sentenseString + '.'

			print(text)
			print('->', sentenseString)

			correctedString = profanity.censor(parser.parse(sentenseString).get('result'))

			print('=>', correctedString)
			data.append(correctedString)

			# Checkpoint: the complete list is rewritten, not appended to.
			if len(data) >= nextTarget:
				nextTarget += 16

				with open('twitter2.json', 'w') as outfile:
					json.dump(data, outfile, indent=2)

	# Search loop: keeps requesting search results until the process is
	# interrupted.
	while True:
		try:
			# One request for up to 100 results of the query 'do', with the
			# current value of index as the start argument.
			for tweet in twitter.search('do', start=index, count=100):
				if tweet.language != 'en': continue
				text = tweet.text

				# Skip retweets (text starting with 'RT') and texts containing
				# '&', '@', '#', a backspace character or anything findUrls() matches.
				if text.startswith('RT'): continue
				if '&' in text or '@' in text or '#' in text or '\b' in text: continue
				if findUrls(text): continue

				parseText(text)
		except Exception as error:
			# Catching Exception instead of using a bare except lets
			# KeyboardInterrupt and SystemExit stop the script. The error is
			# printed so that a failure repeating on every retry is visible.
			print('Error:', repr(error))
			print('Sleeping...')
			time.sleep(10)
			# index is left unchanged, so the same start value is requested again.
			continue

		index += 1
		time.sleep(0.1)