Skip to content

Instantly share code, notes, and snippets.

@pjha1994
Created July 25, 2015 15:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pjha1994/f662ea5b49f128333426 to your computer and use it in GitHub Desktop.
Save pjha1994/f662ea5b49f128333426 to your computer and use it in GitHub Desktop.
import re
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import os
import httplib2
# Module-level counter, initialised to zero; nothing in this file
# currently increments it.
c = 0
def make_soup(s):
    """Fetch ``s`` over HTTP and return it parsed as a BeautifulSoup tree.

    Args:
        s: Candidate URL string.

    Returns:
        A ``BeautifulSoup`` document built from the response body, or
        ``None`` when ``s`` does not look like a URL (in which case no
        request is made).
    """
    # Dots are escaped so that e.g. ".in" only matches a literal ".in" and
    # not "any character followed by 'in'" (as in the word "reading").
    url_hint = re.compile(r'https://|http://|www\.|\.com|\.in|\.org|gov\.in')
    if not url_hint.search(s):
        return None
    http = httplib2.Http()
    # httplib2 returns a (response, content) pair; only the body is parsed.
    _response, content = http.request(s)
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parser libraries happen to be installed.
    return BeautifulSoup(content, 'html.parser')
def is_a_valid_link(href):
    """Return whether ``href`` is an absolute link to an r/news comment thread.

    A link qualifies when it contains an ``http://`` or ``https://`` scheme
    and the ``/r/news/comments/`` path, and does not contain ``modpost``.

    Args:
        href: The link target to test.

    Returns:
        ``True`` if all three conditions hold, ``False`` otherwise.
    """
    has_scheme = re.search('http://|https://', href) is not None
    is_news_thread = re.search('/r/news/comments/', href) is not None
    is_modpost = re.search('modpost', href) is not None
    return has_scheme and is_news_thread and not is_modpost
def parse(s):
    """Fetch the page at ``s`` and print the timestamps inside its links.

    For every ``<a>`` tag selected by class, the ``<time>`` descendants that
    have the ``live-timestamp`` class plus ``datetime`` and ``title``
    attributes are collected and printed, one list per link.

    Args:
        s: URL of the page to scan; handed to ``make_soup``.

    Returns:
        None. Results are printed. Nothing is printed when ``make_soup``
        returns ``None``.
    """
    soup = make_soup(s)
    if soup is None:
        return
    # NOTE(review): ' may-blank' carries a leading space, so it presumably
    # never equals a CSS class token; kept as-is -- confirm the intent.
    anchors = soup.find_all(
        'a', attrs={'class': ['title', ' may-blank', 'loggedin']})
    for link in anchors:
        # Search inside the matched anchor itself. `soup.link` would resolve
        # to the document's first <link> tag (or None), not this loop item.
        stamps = link.find_all(
            'time', class_='live-timestamp', datetime=True, title=True)
        print(stamps)
def read_reddit_images():
    """Scan the r/news front page by handing its URL to ``parse``.

    Returns:
        None. Output is whatever ``parse`` prints.
    """
    # parse() downloads the page itself, so fetching it here as well would
    # only issue a second, discarded HTTP request.
    parse('https://www.reddit.com/r/news/')
# Script entry point: runs the scan as soon as this file is executed
# (or imported), since the call sits at module level.
read_reddit_images()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment