Skip to content

Instantly share code, notes, and snippets.

@pamelafox
Last active August 31, 2020 11:23
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save pamelafox/020683c814e71c8262534563fcc7363f to your computer and use it in GitHub Desktop.
Scrapers
from bs4 import BeautifulSoup
import requests
import string
def save_to_file(filename, sayings):
    """Write *sayings* to *filename*, one saying per line, UTF-8 encoded.

    Fixes from the original: the local name ``file`` shadowed the builtin,
    the handle was never closed if the write raised, and encoding to bytes
    by hand before a text-mode write breaks on Python 3. Opening the file
    with an explicit encoding and a context manager addresses all three.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.write("\n".join(sayings))
def scrape_pickuplinesnet(last_page=71):
    """Scrape pickup lines from pickup-lines.net into pickuplinesnet.txt.

    Fetches listing pages 1..last_page (inclusive), extracts the text of
    every ``.loop-entry-line`` entry, and writes the collected sayings
    via save_to_file.

    last_page defaults to 71, the hard-coded page count of the original
    scraper; pass a different value if the site grows or shrinks.
    """
    sayings = []
    for page in range(1, last_page + 1):
        response = requests.get("https://pickup-lines.net/page/" + str(page) + "/")
        soup = BeautifulSoup(response.text, "html.parser")
        for entry in soup.select('article.loop-entry .loop-entry-line'):
            sayings.append(entry.get_text())
    save_to_file("pickuplinesnet.txt", sayings)
def lower_and_strip(s):
    """Return *s* lower-cased with every character outside a-z removed.

    Produces a normalization key so near-duplicate sayings (differing
    only in case, punctuation, or whitespace) compare equal.
    """
    keep = frozenset(string.ascii_lowercase)
    return ''.join(ch for ch in s.lower() if ch in keep)
def combine_sayings():
    """Concatenate the scraped saying files into sayings.txt.

    Every input line is copied through verbatim; a normalized form of
    each line (via lower_and_strip) is collected so the number of
    distinct sayings can be printed at the end.

    Fixes from the original: the Python 2 ``print`` statement is now a
    function call, and the output file is managed with ``with`` so it is
    closed (and flushed) even if reading an input file raises.

    NOTE(review): duplicates are counted but still written to the output
    — presumably intentional (the count looks informational); confirm
    before changing this to actual deduplication.
    """
    filenames = ['sayings_wikipedia.txt']
    unique_sayings = {}
    with open('sayings.txt', "w") as outfile:
        for fname in filenames:
            with open(fname) as infile:
                for line in infile:
                    unique_sayings[lower_and_strip(line)] = True
                    outfile.write(line)
    print(len(unique_sayings))
# Entry point: run the pickup-lines scrape when this script executes
# (hits the network and writes pickuplinesnet.txt as a side effect).
scrape_pickuplinesnet()
from bs4 import BeautifulSoup
import requests
import string
def save_to_file(filename, sayings):
    """Write *sayings* to *filename*, one saying per line, UTF-8 encoded.

    Fixes from the original: the local name ``file`` shadowed the builtin,
    the handle leaked if the write raised, and hand-encoding to bytes
    before a text-mode write breaks on Python 3. An explicit encoding
    plus a context manager covers all three.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.write("\n".join(sayings))
def scrape_wikipedia():
    """Scrape English proverbs from Wikiquote into sayings_wikipedia.txt.

    Walks every <li> on the page; entries whose text starts with a
    double quote are treated as quoted proverbs and the text between the
    first pair of quotes is kept. Unquoted list items (navigation etc.)
    are skipped.
    """
    r = requests.get("https://en.wikiquote.org/wiki/English_proverbs")
    # Explicit parser: the no-argument form picks whichever parser is
    # installed on the machine (non-deterministic output) and emits a
    # bs4 warning; the pickup-lines scraper already uses "html.parser".
    soup = BeautifulSoup(r.text, "html.parser")
    sayings = []
    for list_item in soup.find_all('li'):
        saying = list_item.get_text()
        # NOTE(review): the pasted source lost its indentation; the
        # append is assumed to be inside this guard (only quoted items
        # are kept) — confirm against the original gist.
        if saying.startswith('"'):
            sayings.append(saying.split('"')[1])
    save_to_file("sayings_wikipedia.txt", sayings)
def scrape_truisms():
    """Scrape truisms from 1001truisms.webs.com into sayings_truisms.txt.

    Each truism sits in the second <span> of a paragraph, hence the
    ``p > span:nth-of-type(2)`` selector.
    """
    r = requests.get("http://1001truisms.webs.com/truisms.htm")
    # Explicit parser for deterministic output across machines and to
    # match the other scrapers in this file.
    soup = BeautifulSoup(r.text, "html.parser")
    sayings = [item.get_text() for item in soup.select('p > span:nth-of-type(2)')]
    save_to_file("sayings_truisms.txt", sayings)
def scrape_phrasesuk():
    """Scrape proverbs from phrases.org.uk into sayings_phrasesuk.txt.

    Fixes from the original: the Python 2 ``print saying`` statement
    (a syntax error on Python 3) is now a function call, and the parser
    is passed explicitly to BeautifulSoup to match the other scrapers.
    """
    r = requests.get("http://www.phrases.org.uk/meanings/proverbs.html")
    soup = BeautifulSoup(r.text, "html.parser")
    sayings = []
    for item in soup.select('p.phrase-list'):
        saying = item.get_text()
        print(saying)  # progress output while scraping
        sayings.append(saying)
    save_to_file("sayings_phrasesuk.txt", sayings)
def scrape_twwproverbs():
    """Scrape proverbs from tww.id.au into sayings_twwproverbs.txt.

    Fixes from the original: the Python 2 ``print saying`` statement
    (a syntax error on Python 3) is now a function call, and the parser
    is passed explicitly to BeautifulSoup to match the other scrapers.
    """
    r = requests.get("http://tww.id.au/proverbs/proverbs.html")
    soup = BeautifulSoup(r.text, "html.parser")
    sayings = []
    for item in soup.select('ul li'):
        saying = item.get_text()
        print(saying)  # progress output while scraping
        sayings.append(saying)
    save_to_file("sayings_twwproverbs.txt", sayings)
def lower_and_strip(s):
    """Normalize *s* for duplicate detection: lower-case and keep a-z only."""
    # 'a' <= c <= 'z' is exactly membership in string.ascii_lowercase
    # (the range is contiguous in Unicode code points).
    return ''.join(c for c in s.lower() if 'a' <= c <= 'z')
def combine_sayings():
    """Concatenate all scraped saying files into sayings.txt.

    Every input line is copied through verbatim; a normalized key for
    each line (via lower_and_strip) is collected so the count of
    distinct sayings can be printed at the end.

    Fixes from the original: the Python 2 ``print`` statement is now a
    function call, and the output file is managed with ``with`` so it is
    closed (and flushed) even if reading an input file raises.

    NOTE(review): duplicates are counted but still written to the output
    — presumably intentional (the count looks informational); confirm
    before changing this to actual deduplication.
    """
    filenames = ['sayings_wikipedia.txt', 'sayings_truisms.txt', 'sayings_phrasesuk.txt', 'sayings_twwproverbs.txt']
    unique_sayings = {}
    with open('sayings.txt', "w") as outfile:
        for fname in filenames:
            with open(fname) as infile:
                for line in infile:
                    unique_sayings[lower_and_strip(line)] = True
                    outfile.write(line)
    print(len(unique_sayings))
# Entry point: merge the previously scraped files into sayings.txt
# (reads the sayings_*.txt files and writes sayings.txt as a side effect).
combine_sayings()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment