# TutorialDoctor/student.py

## student.py
import urllib,pprint
from bs4 import BeautifulSoup

def _definitions_from_text(text):
	"""Build a ``subject -> description`` dict from "X is Y" sentences.

	``text`` is split into sentences on '.'. Every sentence containing
	" is " (with surrounding spaces) is cut at the FIRST " is ": the
	stripped left part becomes the key and the stripped remainder becomes
	the value. If several sentences share a subject, the last one wins.
	"""
	definitions = {}
	for sentence in text.split('.'):
		# partition() cuts key and value at the same delimiter; splitting
		# the key on a bare 'is' would cut "This language" down to "Th".
		subject, separator, description = sentence.partition(' is ')
		if separator:
			definitions[subject.strip()] = description.strip()
	return definitions


def train(url):
	"""Download a page and print the "X is Y" statements found in its text.

	The page at ``url`` is fetched, ``<script>`` and ``<style>`` elements
	are removed, and the remaining text is normalised to one non-empty
	chunk per line before the statements are extracted.

	Args:
		url: Address of the HTML page to read.

	Returns:
		The dict of extracted statements (subject -> description). It is
		also pretty-printed, followed by the line 'COMPLETED TRAINING'.
	"""
	# Resolved locally so the function runs on Python 3 (urllib.request)
	# as well as on Python 2 (urllib.urlopen).
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	response = urlopen(url)
	try:
		html = response.read()
	finally:
		# Release the connection even when reading fails.
		response.close()

	# Name the parser explicitly so the result does not depend on which
	# optional parsers happen to be installed.
	soup = BeautifulSoup(html, 'html.parser')
	# kill all script and style elements
	for tag in soup(["script", "style"]):
		tag.extract()
	text = soup.get_text()
	# break into lines and remove leading and trailing space on each
	lines = (line.strip() for line in text.splitlines())
	# break multi-headlines (separated by a double space) into a line each
	chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
	# drop blank lines
	text = '\n'.join(chunk for chunk in chunks if chunk)

	dic = _definitions_from_text(text)
	pprint.PrettyPrinter(depth=2).pprint(dic)
	print('COMPLETED TRAINING')
	return dic

# Script entry point: this call is unguarded, so the Telugu-language
# Wikipedia page (mobile URL) is processed whenever the file is run or imported.
train("https://en.m.wikipedia.org/wiki/Telugu_language")


"""
# PYTHONISTA VERSION (kept as a disabled string; uses urllib.request and the html5lib parser)
import urllib.request,pprint
from bs4 import BeautifulSoup

def train(url):
	html=urllib.request.urlopen(url).read()
	soup = BeautifulSoup(html,'html5lib')
	# kill all script and style elements
	for script in soup(["script", "style"]):
		script.extract()    # rip it out
	text = soup.get_text()
	# break into lines and remove leading and trailing space on each
	lines = (line.strip() for line in text.splitlines())
	# break multi-headlines into a line each
	chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
	# drop blank lines
	text = '\n'.join(chunk for chunk in chunks if chunk)

	dic={}
	#dic=set()
	y=text.split('.')
	#print(y)
	for i in y:
		if ' is ' in i:
			#dic[i.split('is')[0].strip()]=i.split('is')[1].strip() #
			dic.update({i.split('is')[0].strip():i.split(' is ')[1].strip()}) #unique?
	pprint.PrettyPrinter(depth=2).pprint(dic)
	print('COMPLETED TRAINING')

train("https://en.m.wikipedia.org/wiki/Telugu_language")
"""
	import urllib,pprint
	from bs4 import BeautifulSoup

	def _definitions_from_text(text):
		"""Build a ``subject -> description`` dict from "X is Y" sentences.

		``text`` is split into sentences on '.'. Every sentence containing
		" is " (with surrounding spaces) is cut at the FIRST " is ": the
		stripped left part becomes the key and the stripped remainder
		becomes the value. If several sentences share a subject, the last
		one wins.
		"""
		definitions = {}
		for sentence in text.split('.'):
			# partition() cuts key and value at the same delimiter; splitting
			# the key on a bare 'is' would cut "This language" down to "Th".
			subject, separator, description = sentence.partition(' is ')
			if separator:
				definitions[subject.strip()] = description.strip()
		return definitions


	def train(url):
		"""Download a page and print the "X is Y" statements found in its text.

		The page at ``url`` is fetched, ``<script>`` and ``<style>`` elements
		are removed, and the remaining text is normalised to one non-empty
		chunk per line before the statements are extracted.

		Args:
			url: Address of the HTML page to read.

		Returns:
			The dict of extracted statements (subject -> description). It is
			also pretty-printed, followed by the line 'COMPLETED TRAINING'.
		"""
		# Resolved locally so the function runs on Python 3 (urllib.request)
		# as well as on Python 2 (urllib.urlopen).
		try:
			from urllib.request import urlopen
		except ImportError:
			from urllib import urlopen

		response = urlopen(url)
		try:
			html = response.read()
		finally:
			# Release the connection even when reading fails.
			response.close()

		# Name the parser explicitly so the result does not depend on which
		# optional parsers happen to be installed.
		soup = BeautifulSoup(html, 'html.parser')
		# kill all script and style elements
		for tag in soup(["script", "style"]):
			tag.extract()
		text = soup.get_text()
		# break into lines and remove leading and trailing space on each
		lines = (line.strip() for line in text.splitlines())
		# Break multi-headlines into a line each. The delimiter must be a
		# DOUBLE space: a single-space split puts every word on its own
		# line, so no sentence could ever contain " is ".
		chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
		# drop blank lines
		text = '\n'.join(chunk for chunk in chunks if chunk)

		dic = _definitions_from_text(text)
		pprint.PrettyPrinter(depth=2).pprint(dic)
		print('COMPLETED TRAINING')
		return dic

	# Script entry point: runs train() on the Telugu-language Wikipedia page
	# (mobile URL); the call is not wrapped in a __main__ guard.
	train("https://en.m.wikipedia.org/wiki/Telugu_language")


	"""
	# PYTHONISTA VERSION (kept as a disabled string; uses urllib.request and the html5lib parser)
	import urllib.request,pprint
	from bs4 import BeautifulSoup

	def train(url):
	html=urllib.request.urlopen(url).read()
	soup = BeautifulSoup(html,'html5lib')
	# kill all script and style elements
	for script in soup(["script", "style"]):
	script.extract() # rip it out
	text = soup.get_text()
	# break into lines and remove leading and trailing space on each
	lines = (line.strip() for line in text.splitlines())
	# break multi-headlines into a line each
	chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
	# drop blank lines
	text = '\n'.join(chunk for chunk in chunks if chunk)

	dic={}
	#dic=set()
	y=text.split('.')
	#print(y)
	for i in y:
	if ' is ' in i:
	#dic[i.split('is')[0].strip()]=i.split('is')[1].strip() #
	dic.update({i.split('is')[0].strip():i.split(' is ')[1].strip()}) #unique?
	pprint.PrettyPrinter(depth=2).pprint(dic)
	print('COMPLETED TRAINING')

	train("https://en.m.wikipedia.org/wiki/Telugu_language")
	"""