-
-
Save brianchesley/9598044 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup | |
import urllib2 | |
import urllib | |
import re | |
def get_links(url="http://apps.carleton.edu/news/news/"):
    """Scrape the Carleton news index page for article links.

    url -- index page to scrape (default kept so existing callers
           need no change).

    Returns a list of single-element lists, each holding one absolute
    article URL (the shape expected by append_titles/run), in page
    order with duplicates removed.
    """
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)
    # Article links on the index page are relative hrefs that start
    # with "?s" (e.g. "?story_id=..."); this regex is the expression
    # the original comment said it wanted: it is exactly equivalent to
    # href.find("?") == 0 and href.find("s") == 1.
    article_href = re.compile(r"^\?s")
    seen = set()          # O(1) membership test instead of the O(n) list scan
    final_list = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # get('href') returns None for anchors without an href; test it
        # explicitly instead of the original bare except: pass.
        if href and article_href.match(href) and href not in seen:
            seen.add(href)
            final_list.append([url + href])
    return final_list
def get_titles(news_list):
    """Download every article page and collect its <title> string.

    news_list -- list of single-element lists, each holding one URL
                 (the shape produced by get_links).

    Returns the title strings in the same order as news_list.
    """
    return [BeautifulSoup(urllib2.urlopen(article_url).read()).title.string
            for entry in news_list
            for article_url in entry]
def append_titles(news_list, titles):
    """Attach each title and its position to the matching news entry.

    news_list -- list of single-element lists [url] (from get_links)
    titles    -- parallel list of title strings (from get_titles)

    Mutates news_list in place so each entry becomes [url, title, index],
    and returns it.  The original's inner ``for link in range(1)`` loop
    did nothing and has been removed; enumerate replaces range(len(...)).
    """
    for index, title in enumerate(titles):
        news_list[index].append(title)
        news_list[index].append(index)
    return news_list
def get_text(url):
    """Fetch one article page and return the text content of its main
    <div class="text"> body."""
    page = urllib2.urlopen(url).read()
    return BeautifulSoup(page).find("div", class_="text").text
def run(news_list): | |
while True: | |
print "here's the latest from Carleton News: " | |
for links in range(len(news_list)): | |
print "article number " + str(news_list[links][2] + 1) | |
print news_list[links][1] | |
article_num = input("which article would you like to read? ") | |
print news_list[article_num-1][1] | |
print get_text(news_list[article_num-1][0]) | |
more_articles = input("would you like to read more articles? Press 1 if yes ") | |
if more_articles != 1: | |
break | |
run(append_titles(get_links(),get_titles(get_links()))) |
brianchesley
commented
Mar 17, 2014
- Had questions about the conditional in the get_links function--I wanted to use a regular expression, but for some reason I could not get it to work. I couldn't figure out if regular expressions are compatible with unicode/beautiful soup or if my expression was just wrong.
- I had a lot of difficulty manipulating the unicode/whatever beautiful soup returns. I wanted to delete the first part of the title that reads "Carleton College: Carleton News: News" and make spacing changes to the article text but couldn't. Is there an easy work around to delete parts of the text? I tried find/index etc to no avail.
Have you used a repl before? Try getting beautiful soup objects in a REPL (like bpython) and seeing what methods they have, or try reading the docs at http://www.crummy.com/software/BeautifulSoup/bs4/doc/
I find https://www.debuggex.com/ really useful for debugging Python regular expressions - be sure to change the language on the dropdown from JavaScript to Python.
I mention a REPL because that makes answering questions like "are beautiful soup objects compatible with regex" pretty easy. (turns out they're not, you'll need to use regex with the text you get out of a beautiful soup object)
I misunderstood you, looks like you totally can - see http://www.crummy.com/software/BeautifulSoup/bs4/doc/#a-regular-expression
My comments would mostly be to break this up into more functions, in order to give names to the things you're doing at each step. I'd make url a parameter instead of hardcoding it. In lines 30/31, a "set" would be better.