JFriel/scraper

## scraper
import urllib
from bs4 import BeautifulSoup
import sys
import nltk
from nltk import word_tokenize

def splitParagraphIntoSentences(paragraph):
    ''' break a paragraph into sentences
        and return a list '''
    import re
    # to split by multile characters

    #   regular expressions are easiest (and fastest)
    sentenceEnders = re.compile('[.!?]')
    sentenceList = sentenceEnders.split(paragraph)
    return sentenceList

url = "https://en.wikipedia.org/wiki/Special:Random"
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)

# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()    # rip it out

# get text
text = soup.get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)

#Up to here it just give a pure scrape, stored in 'text'

title = []
for i in range(0,50):
	if(text[i] != '-'):
		title.append(text[i])
	else:
		break
fulltitle = ''.join(title).encode('utf-8')
print fulltitle


tokenisedWiki = splitParagraphIntoSentences(text)
if(tokenisedWiki[1] == " Please improve this article by introducing more precise citations" or tokenisedWiki[1] == " Please help to improve the section, or discuss the issue on the talk page"):
    print tokenisedWiki[2]
else:
    print tokenisedWiki[1]
	import urllib
	from bs4 import BeautifulSoup
	import sys
	import nltk
	from nltk import word_tokenize

	def splitParagraphIntoSentences(paragraph):
	''' break a paragraph into sentences
	and return a list '''
	import re
	# to split by multile characters

	# regular expressions are easiest (and fastest)
	sentenceEnders = re.compile('[.!?]')
	sentenceList = sentenceEnders.split(paragraph)
	return sentenceList

	url = "https://en.wikipedia.org/wiki/Special:Random"
	html = urllib.urlopen(url).read()
	soup = BeautifulSoup(html)

	# kill all script and style elements
	for script in soup(["script", "style"]):
	script.extract() # rip it out

	# get text
	text = soup.get_text()

	# break into lines and remove leading and trailing space on each
	lines = (line.strip() for line in text.splitlines())
	# break multi-headlines into a line each
	chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
	# drop blank lines
	text = '\n'.join(chunk for chunk in chunks if chunk)

	#Up to here it just give a pure scrape, stored in 'text'

	title = []
	for i in range(0,50):
	if(text[i] != '-'):
	title.append(text[i])
	else:
	break
	fulltitle = ''.join(title).encode('utf-8')
	print fulltitle


	tokenisedWiki = splitParagraphIntoSentences(text)
	if(tokenisedWiki[1] == " Please improve this article by introducing more precise citations" or tokenisedWiki[1] == " Please help to improve the section, or discuss the issue on the talk page"):
	print tokenisedWiki[2]
	else:
	print tokenisedWiki[1]