Skip to content

Instantly share code, notes, and snippets.

@vatsalsaglani
Last active August 5, 2018 18:58
Show Gist options
  • Save vatsalsaglani/5bf6a622e3fe6ecaf4af53edbf7f6be7 to your computer and use it in GitHub Desktop.
Save vatsalsaglani/5bf6a622e3fe6ecaf4af53edbf7f6be7 to your computer and use it in GitHub Desktop.
from urllib.parse import urljoin, urlparse, urlsplit
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup as BS
from PIL import Image
# Index page whose <h3 class="entry-title"> headings link to the posts to scrape.
website_page = 'http://nameofyourwebsite.com'


def fetch_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Raises whatever ``urlopen`` raises when the request fails.
    """
    # The context manager closes the HTTP response even if parsing raises.
    with urlopen(url) as response:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser is installed, so results can differ between machines and a
        # warning is emitted.
        return BS(response, 'html.parser')


def extract_post_text(soup):
    """Return the stripped text of every <p> and <h4> in *soup*, concatenated.

    Fragments are joined with no separator, matching the original output.
    """
    return ''.join(node.text.strip() for node in soup.find_all(['p', 'h4']))


def scrape_posts(index_url):
    """Collect ``(title, link, post_text)`` for each titled post on *index_url*.

    Every <h3 class="entry-title"> heading is inspected; the first <a> inside
    it that carries an ``href`` is followed and the linked page's text is
    extracted. Each followed link is printed as a progress indicator.
    """
    records = []
    for heading in fetch_soup(index_url).find_all('h3', class_='entry-title'):
        # Ask for the anchor directly instead of indexing every child of the
        # heading: children can be plain text nodes or tags without an href,
        # and subscripting those with ['href'] raises.
        anchor = heading.find('a', href=True)
        if anchor is None:
            # No link means there is no post page to fetch for this heading.
            continue
        # Resolve relative hrefs against the index page; absolute URLs are
        # returned unchanged by urljoin.
        link = urljoin(index_url, anchor['href'])
        print(link)
        post_text = extract_post_text(fetch_soup(link))
        records.append((heading.text.strip(), link, post_text))
    return records


def build_frame(records):
    """Return a DataFrame with Title, Link and Post columns from *records*.

    *records* is an iterable of 3-item sequences in that column order. An
    empty iterable yields an empty frame that still has the three columns.
    """
    return pd.DataFrame(list(records), columns=['Title', 'Link', 'Post'])


def main():
    """Scrape ``website_page`` and write the result to ``file_name.csv``."""
    df = build_frame(scrape_posts(website_page))
    # Tab-separated because post text routinely contains commas.
    df.to_csv('file_name.csv', sep='\t', encoding='utf-8')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment