Skip to content

Instantly share code, notes, and snippets.

@mcky
Created August 28, 2015 10:07
Show Gist options
  • Save mcky/6ad8feddc6514ed0a5c8 to your computer and use it in GitHub Desktop.
Save mcky/6ad8feddc6514ed0a5c8 to your computer and use it in GitHub Desktop.
from scrapeMeta import scrapeMeta
import os
import json
from bs4 import BeautifulSoup
import urllib
def getHTML(url):
    """Download *url* and return its markup parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the response body.
    """
    # Function-scope imports keep this block self-contained.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        urlopen = urllib.urlopen  # Python 2, which the original code targeted

    # closing() releases the connection even if read() raises; the Python 2
    # response object is not a context manager on its own.
    with closing(urlopen(url)) as response:
        html = response.read()

    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers happen to be installed on the machine.
    return BeautifulSoup(html, 'html.parser')
def scrapeMeta(url):
    """Collect the title, description and Open Graph metadata of a page.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with the keys ``url``, ``title``, ``description`` and
        ``opengraph``. ``opengraph`` maps every ``og:<name>`` property listed
        in ``ogtags`` to the tag's ``content`` value, or to ``''`` when the
        page does not provide it.
    """
    # Reuse the shared fetch-and-parse helper instead of duplicating it.
    soup = getHTML(url)

    # Fall back to the whole document when the parser produced no <head>,
    # rather than failing on attribute access against None.
    head = soup.head if soup.head is not None else soup

    ogtags = ['title', 'description', 'site_name', 'url', 'image',
              'image:secure_url', 'image:type', 'video',
              'video:secure_url', 'video:type']

    titleTag = head.title
    descriptionTag = head.find(attrs={"name": "description"})

    headObj = {}
    headObj['url'] = url
    # A missing <title> or description meta tag yields '' (the same default
    # the Open Graph entries use) instead of raising.
    headObj['title'] = titleTag.string if titleTag is not None else ''
    headObj['description'] = (
        descriptionTag.get('content', '') if descriptionTag is not None else ''
    )
    headObj['opengraph'] = {}

    for tag in ogtags:
        tagName = 'og:{0}'.format(tag)
        ogTag = head.find(property=tagName)
        # The tag was looked up by this exact property value, so tagName is
        # the right key whether or not the tag was found.
        headObj['opengraph'][tagName] = (
            ogTag.get('content', '') if ogTag is not None else ''
        )

    return headObj
def saveMeta(urls):
    """Write the scraped metadata to ``meta.json`` in the working directory.

    Args:
        urls: JSON-serialisable object to store. The script in this file
            passes the list of dictionaries returned by ``scrapeMeta``.
    """
    # The context manager closes the file even when json.dump raises
    # (for example on a value that cannot be serialised).
    with open('meta.json', 'w') as outFile:
        json.dump(urls, outFile, indent=4)
# Path of the text file listing the pages to scrape, one URL per line.
file = './urls.txt'
headers = []

# Read the list inside a context manager so the handle is closed promptly.
# Strip all surrounding whitespace and drop empty lines so that blank lines
# and Windows line endings are not passed on as bogus URLs.
with open(file) as urlFile:
    urls = [line.strip() for line in urlFile if line.strip()]

# Scrape every page in order, then persist the collected metadata.
for url in urls:
    output = scrapeMeta(url)
    headers.append(output)
saveMeta(headers)
http://localhost:3000
http://localhost:3000/about/
http://localhost:3000/terms/
http://localhost:3000/privacy/
http://localhost:3000/goals/no-poverty
http://localhost:3000/goals/quality-education/
http://localhost:3000/s/696/
http://localhost:3000/s/700/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment