# drrobotnik/soup.py

## soup.py
from collections import Counter
import argparse
import fnmatch
import os
import sys
from bs4 import BeautifulSoup

# Script-wide state:
#   matches  -- full paths of the *.html files found below --path
#   elements -- container that walker() fills with the parsed DOM children
matches = []
elements = {}

parser = argparse.ArgumentParser(description='scrape files and parse doms')

# --path is mandatory: without it args.path is None and os.walk() fails with
# a traceback, so let argparse report the missing option instead.
parser.add_argument('--path', required=True, help='path of files')
parser.add_argument('--element', help='dom element type')

args = parser.parse_args()

# Recursively collect every *.html file below the requested directory.
for root, dirnames, filenames in os.walk(args.path):
    matches.extend(
        os.path.join(root, filename)
        for filename in fnmatch.filter(filenames, '*.html')
    )

def walker(soup, store=None):
    """Recursively group every named descendant of *soup* by its tag name.

    Each child whose ``name`` is not None is appended to
    ``store[str(child.name)]`` and then walked in turn; children whose
    ``name`` is None are skipped and not descended into.

    Args:
        soup: A node exposing ``name`` and ``children``.  Nothing is done
            when ``soup.name`` is None.
        store: Dict mapping tag name -> list of nodes, updated in place.
            Defaults to the module-level ``elements`` dict.
    """
    if store is None:
        store = elements
    if soup.name is None:
        return
    for child in soup.children:
        if child.name is None:
            continue
        # Always group by name: the container is a dict, so there is no
        # list to append to, whether or not --element was given.
        store.setdefault(str(child.name), []).append(child)
        walker(child, store)

# Parse every discovered HTML file and hand its DOM to walker().
# The loop variable is no longer called ``file`` so it does not shadow the
# Python 2 builtin of that name.
for html_path in matches:
    # The context manager closes the handle even if parsing raises.
    with open(html_path) as data:
        soup = BeautifulSoup(data, 'lxml')
    walker(soup)


# Choose the elements to report on.
if args.element:
    # .get() so a tag that never occurred yields an empty report rather
    # than a KeyError.
    elementsByType = elements.get(args.element, [])
else:
    # No filter: flatten the per-tag lists so Counter counts elements
    # instead of treating the lists themselves as counts.
    elementsByType = [node for group in elements.values() for node in group]


# NOTE(review): storelen and nodupes are not read by the report below; they
# are kept in case code outside this section relies on them -- confirm.
storelen = len(elementsByType)
nodupes = list(set(elementsByType))

#mostFrequent = sorted(elementsByType, key=Counter(elementsByType).get, reverse=True)
#sorted = sorted(nodupes, key=len)

# (element, occurrences) pairs, most frequent first.
mostCommon = Counter(elementsByType).most_common()

# Single-argument print(...) emits the same text on Python 2 and Python 3.
for markup, occurrences in mostCommon:
    print("==============================")
    # The two spaces before "times" reproduce the original output exactly.
    print("Element occurs: %s  times" % (occurrences,))
    print("==============================")
    print(markup)
    print("==============================")

#for element in sorted:
#for element in mostFrequent:
#    print element
	from collections import Counter
	import argparse
	import fnmatch
	import os
	import sys
	from bs4 import BeautifulSoup

	# Script-wide state:
	#   matches  -- full paths of the *.html files found below --path
	#   elements -- container that walker() fills with the parsed DOM children
	matches = []
	elements = {}

	parser = argparse.ArgumentParser(description='scrape files and parse doms')

	# --path is mandatory: without it args.path is None and os.walk() fails
	# with a traceback, so let argparse report the missing option instead.
	parser.add_argument('--path', required=True, help='path of files')
	parser.add_argument('--element', help='dom element type')

	args = parser.parse_args()

	# Recursively collect every *.html file below the requested directory.
	for root, dirnames, filenames in os.walk(args.path):
		matches.extend(
			os.path.join(root, filename)
			for filename in fnmatch.filter(filenames, '*.html')
		)

	def walker(soup, store=None):
		"""Recursively group every named descendant of *soup* by its tag name.

		Each child whose ``name`` is not None is appended to
		``store[str(child.name)]`` and then walked in turn; children whose
		``name`` is None are skipped and not descended into.

		Args:
			soup: A node exposing ``name`` and ``children``.  Nothing is
				done when ``soup.name`` is None.
			store: Dict mapping tag name -> list of nodes, updated in
				place.  Defaults to the module-level ``elements`` dict.
		"""
		if store is None:
			store = elements
		if soup.name is None:
			return
		for child in soup.children:
			if child.name is None:
				continue
			# Always group by name: the container is a dict, so there is
			# no list to append to, whether or not --element was given.
			store.setdefault(str(child.name), []).append(child)
			walker(child, store)

	# Parse every discovered HTML file and hand its DOM to walker().
	# The loop variable is no longer called ``file`` so it does not shadow
	# the Python 2 builtin of that name.
	for html_path in matches:
		# The context manager closes the handle even if parsing raises.
		with open(html_path) as data:
			soup = BeautifulSoup(data, 'lxml')
		walker(soup)


	# Choose the elements to report on.
	if args.element:
		# .get() so a tag that never occurred yields an empty report rather
		# than a KeyError.
		elementsByType = elements.get(args.element, [])
	else:
		# No filter: flatten the per-tag lists so Counter counts elements
		# instead of treating the lists themselves as counts.
		elementsByType = [node for group in elements.values() for node in group]


	# NOTE(review): storelen and nodupes are not read by the report below;
	# they are kept in case code outside this section relies on them -- confirm.
	storelen = len(elementsByType)
	nodupes = list(set(elementsByType))

	#mostFrequent = sorted(elementsByType, key=Counter(elementsByType).get, reverse=True)
	#sorted = sorted(nodupes, key=len)

	# (element, occurrences) pairs, most frequent first.
	mostCommon = Counter(elementsByType).most_common()

	# Single-argument print(...) emits the same text on Python 2 and Python 3.
	for markup, occurrences in mostCommon:
		print("==============================")
		# The two spaces before "times" reproduce the original output exactly.
		print("Element occurs: %s  times" % (occurrences,))
		print("==============================")
		print(markup)
		print("==============================")

	#for element in sorted:
	#for element in mostFrequent:
	# print element