Skip to content

Instantly share code, notes, and snippets.

@drrobotnik
Created February 7, 2019 19:39
Show Gist options
  • Save drrobotnik/2fb14929428ba30deff79f30cba14fe7 to your computer and use it in GitHub Desktop.
Save drrobotnik/2fb14929428ba30deff79f30cba14fe7 to your computer and use it in GitHub Desktop.
from collections import Counter
import argparse
import fnmatch
import os
import sys
from bs4 import BeautifulSoup
# Paths of every ``*.html`` file discovered beneath ``--path``.
matches = []
# Tags collected by ``walker``, keyed by tag name (tag name -> list of tags).
elements = {}

# Command-line interface:
#   --path     directory tree to scan for ``*.html`` files
#   --element  optional tag name to restrict the report to
parser = argparse.ArgumentParser(description='scrape files and parse doms')
# ``--path`` is required: without it ``os.walk`` would be handed ``None`` and
# the script would die with a traceback instead of a usage message.
parser.add_argument('--path', required=True, help='path of files')
parser.add_argument('--element', help='dom element type')
args = parser.parse_args()

# Recursively gather the full path of every HTML file under the given root.
for root, dirnames, filenames in os.walk(args.path):
    matches.extend(
        os.path.join(root, filename)
        for filename in fnmatch.filter(filenames, '*.html')
    )
def walker(soup):
    """Recursively record every named descendant of ``soup`` in ``elements``.

    Walks ``soup.children`` depth-first (parent before its own children) and
    appends each child whose ``name`` is not ``None`` to the module-level
    ``elements`` dict under ``elements[str(child.name)]``.

    Tags are always grouped by name.  The former flat ``elements.append(...)``
    branch (taken when ``--element`` was omitted) raised ``AttributeError``
    because ``elements`` is a dict, not a list.

    Args:
        soup: Any node exposing ``name`` and ``children`` attributes, e.g. a
            parsed document or a tag.  Nodes whose ``name`` is ``None`` are
            ignored (presumably text/comment nodes -- confirm against bs4).

    Returns:
        None.  Results are accumulated in the global ``elements``.
    """
    # A node without a name is not descended into.
    if soup.name is None:
        return
    for child in soup.children:
        # Skip unnamed children; only real tags are recorded.
        if child.name is None:
            continue
        elements.setdefault(str(child.name), []).append(child)
        walker(child)
# Parse every discovered HTML file and feed its DOM to ``walker``.
for html_path in matches:
    # Binary mode lets Beautiful Soup detect the document encoding itself, and
    # ``with`` guarantees the handle is closed once the document is parsed.
    with open(html_path, 'rb') as data:
        soup = BeautifulSoup(data, 'lxml')
    walker(soup)

# Choose what to count: only the requested tag type, or every collected tag.
if args.element:
    # ``.get`` avoids a KeyError when the requested tag never occurred.
    elementsByType = elements.get(args.element, [])
else:
    # ``elements`` maps tag name -> list of tags; flatten it so each tag is
    # counted individually instead of counting the dict's keys.
    elementsByType = [tag for group in elements.values() for tag in group]

# (element, count) pairs, most frequent first.
mostCommon = Counter(elementsByType).most_common()

separator = "=============================="
for element, occurrences in mostCommon:
    # Single-argument ``print(...)`` emits the same text on Python 2 and 3.
    print(separator)
    print("Element occurs: %s  times" % occurrences)
    print(separator)
    print(element)
    print(separator)
@drrobotnik
Copy link
Author

mkdir $(date +%Y%m%d) && cd $(date +%Y%m%d)

wget --recursive \
--no-clobber \
--page-requisites \
--restrict-file-names=unix \
--exclude-directories="/path/to/exclude,/another/path/to/exclude" \
--no-check-certificate \
--domains domain.to.parse.com \
--no-parent https://domain.to.parse.com

python soup.py --path="../../spider/20180709/domain.to.parse.com/" --element="div"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment