mcky/scrape.py

## scrape.py
from scrapeMeta import scrapeMeta
import os
import json
from bs4 import BeautifulSoup
import urllib

def getHTML(url):
    """Download ``url`` and return its markup parsed by BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` document built from the response body.

    Any error raised while opening or reading the URL propagates to the
    caller unchanged; the response is closed either way.
    """
    # ``urllib.urlopen`` exists only on Python 2; on Python 3 the same
    # function lives in ``urllib.request``.  Import locally so the helper
    # works on both without touching the module-level imports.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Python 2 responses are not context managers, so close explicitly.
        response.close()

    # NOTE(review): no parser is named, so bs4 chooses one itself; left
    # unchanged on purpose so parse results stay the same.
    return BeautifulSoup(html)

def scrapeMeta(url):
    """Collect the title, description and Open Graph metadata of a page.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with four keys: ``'url'`` (the argument as given),
        ``'title'`` (the ``.string`` of the ``<title>`` element),
        ``'description'`` (``content`` of the element whose ``name``
        attribute is ``description``) and ``'opengraph'``, a dict mapping
        ``'og:<name>'`` to that tag's ``content`` for every name in
        ``ogtags``.  Elements the page does not contain are reported as
        ``''`` instead of raising.
    """
    ogtags = ['title', 'description', 'site_name', 'url', 'image',
              'image:secure_url', 'image:type', 'video',
              'video:secure_url', 'video:type']

    # Reuse the module's fetch-and-parse helper instead of repeating it here.
    head = getHTML(url).head

    def find(**criteria):
        # A document without <head> has none of the elements we look for.
        return head.find(**criteria) if head is not None else None

    def contentOf(meta):
        # A missing element and a missing ``content`` attribute both mean
        # "no value", matching the '' used for absent Open Graph tags.
        return meta.get('content', '') if meta is not None else ''

    title = head.title if head is not None else None

    headObj = {
        'url': url,
        'title': title.string if title is not None else '',
        'description': contentOf(find(attrs={"name": "description"})),
        'opengraph': {},
    }

    for tag in ogtags:
        # The lookup is by exact property value, so the search key is also
        # the key under which the result is stored.
        ogName = 'og:{0}'.format(tag)
        headObj['opengraph'][ogName] = contentOf(find(property=ogName))

    return headObj

def saveMeta(urls):
    """Write ``urls`` as JSON to ``meta.json`` in the working directory.

    Args:
        urls: JSON-serialisable data to store; this script passes the
            list of dicts returned by ``scrapeMeta``.

    The file is overwritten and the JSON is indented by four spaces.
    """
    # ``with`` closes the file even when serialisation fails part-way.
    with open('meta.json', 'w') as outFile:
        json.dump(urls, outFile, indent=4)

# Driver: read one URL per line from ``urls.txt``, scrape each page and
# store all results in ``meta.json``.
file = './urls.txt'

# ``with`` closes the list file once it is read.  Surrounding whitespace
# (including a stray '\r') is stripped and blank lines are skipped so an
# empty string is never handed to the scraper.
with open(file) as urlFile:
    urls = [line.strip() for line in urlFile if line.strip()]

headers = [scrapeMeta(url) for url in urls]

saveMeta(headers)


## urls.txt
http://localhost:3000
http://localhost:3000/about/
http://localhost:3000/terms/
http://localhost:3000/privacy/
http://localhost:3000/goals/no-poverty
http://localhost:3000/goals/quality-education/
http://localhost:3000/s/696/
http://localhost:3000/s/700/
	from scrapeMeta import scrapeMeta
	import os
	import json
	from bs4 import BeautifulSoup
	import urllib

	def getHTML(url):
	    """Download ``url`` and return its markup parsed by BeautifulSoup.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The ``BeautifulSoup`` document built from the response body.

	    Any error raised while opening or reading the URL propagates to the
	    caller unchanged; the response is closed either way.
	    """
	    # ``urllib.urlopen`` exists only on Python 2; on Python 3 the same
	    # function lives in ``urllib.request``.  Import locally so the helper
	    # works on both without touching the module-level imports.
	    try:
	        from urllib.request import urlopen
	    except ImportError:
	        from urllib import urlopen

	    response = urlopen(url)
	    try:
	        html = response.read()
	    finally:
	        # Python 2 responses are not context managers, so close explicitly.
	        response.close()

	    # NOTE(review): no parser is named, so bs4 chooses one itself; left
	    # unchanged on purpose so parse results stay the same.
	    return BeautifulSoup(html)

	def scrapeMeta(url):
	    """Collect the title, description and Open Graph metadata of a page.

	    Args:
	        url: Address of the page to scrape.

	    Returns:
	        A dict with four keys: ``'url'`` (the argument as given),
	        ``'title'`` (the ``.string`` of the ``<title>`` element),
	        ``'description'`` (``content`` of the element whose ``name``
	        attribute is ``description``) and ``'opengraph'``, a dict mapping
	        ``'og:<name>'`` to that tag's ``content`` for every name in
	        ``ogtags``.  Elements the page does not contain are reported as
	        ``''`` instead of raising.
	    """
	    ogtags = ['title', 'description', 'site_name', 'url', 'image',
	              'image:secure_url', 'image:type', 'video',
	              'video:secure_url', 'video:type']

	    # Reuse the module's fetch-and-parse helper instead of repeating it here.
	    head = getHTML(url).head

	    def find(**criteria):
	        # A document without <head> has none of the elements we look for.
	        return head.find(**criteria) if head is not None else None

	    def contentOf(meta):
	        # A missing element and a missing ``content`` attribute both mean
	        # "no value", matching the '' used for absent Open Graph tags.
	        return meta.get('content', '') if meta is not None else ''

	    title = head.title if head is not None else None

	    headObj = {
	        'url': url,
	        'title': title.string if title is not None else '',
	        'description': contentOf(find(attrs={"name": "description"})),
	        'opengraph': {},
	    }

	    for tag in ogtags:
	        # The lookup is by exact property value, so the search key is also
	        # the key under which the result is stored.
	        ogName = 'og:{0}'.format(tag)
	        headObj['opengraph'][ogName] = contentOf(find(property=ogName))

	    return headObj

	def saveMeta(urls):
	    """Write ``urls`` as JSON to ``meta.json`` in the working directory.

	    Args:
	        urls: JSON-serialisable data to store; this script passes the
	            list of dicts returned by ``scrapeMeta``.

	    The file is overwritten and the JSON is indented by four spaces.
	    """
	    # ``with`` closes the file even when serialisation fails part-way.
	    with open('meta.json', 'w') as outFile:
	        json.dump(urls, outFile, indent=4)

	# Driver: read one URL per line from ``urls.txt``, scrape each page and
	# store all results in ``meta.json``.
	file = './urls.txt'

	# ``with`` closes the list file once it is read.  Surrounding whitespace
	# (including a stray '\r') is stripped and blank lines are skipped so an
	# empty string is never handed to the scraper.
	with open(file) as urlFile:
	    urls = [line.strip() for line in urlFile if line.strip()]

	headers = [scrapeMeta(url) for url in urls]

	saveMeta(headers)
	http://localhost:3000
	http://localhost:3000/about/
	http://localhost:3000/terms/
	http://localhost:3000/privacy/
	http://localhost:3000/goals/no-poverty
	http://localhost:3000/goals/quality-education/
	http://localhost:3000/s/696/
	http://localhost:3000/s/700/