Skip to content

Instantly share code, notes, and snippets.

@limsammy
Created March 2, 2019 18:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save limsammy/fa06e2bfbc50e61e00030fb86a1b57f9 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import re
from urllib.request import *
import urllib.request
import argparse
import requests
import os
import cv2
from imutils import paths
# ap = argparse.ArgumentParser()
# ap.add_argument("-o", "--output", required=True,
# help="path to training-set directory")
# args = vars(ap.parse_args())
# given the gun category url, group sub categories into classifications
# iterate over each classification's sub categories
# iterate over each article
# grab all images on article
# save to classification dir with unique name
# Classification labels for the training set, mapped to the imfdb.org
# sub-category titles that belong to each label. A sub-category title
# that appears in none of these lists (and is not in IGNORE) has no
# classification.
CLASSES = {
    'rifle': [
        'AR Derivatives',
        'Assault Rifle',
        'Battle Rifle',
        'Bullpup',
        'Carbine',
        'Rifle',
        'Sniper Rifle',
        'Submachine Gun',
        'Machine Gun',
        'Muzzleloader'
    ],
    'handgun': [
        'Flare Gun',
        'Machine Pistol',
        'Pistol',
        'Revolver',
    ],
    # NOTE(review): 'knife' has no site sub-categories mapped, so the
    # scrape loop below never files anything under it — confirm intended.
    'knife': [],
    'shotgun': ['Shotgun'],
    'non-lethal': ['Less-Than Lethal'],
    'explosive': [
        'Flamethrower',
        'Missile Launcher',
        'Grenade',
        'Grenade Launcher',
        'Mine',
        'Mortar',
    ]
}
# Sub-category titles to skip entirely during the scrape.
IGNORE = ['Fictional Firearm', 'MANPADS', 'STANAG', 'UBGL', 'Underwater Firearm']
# Maps sub-category title -> relative wiki URL; populated below.
urls = {}
# Root of the site being scraped; all relative hrefs are joined onto it.
BASE_URL = 'http://www.imfdb.org'
# Fetch the top-level "Gun" category page and record every sub-category
# link it lists, keyed by sub-category title, into the `urls` dict.
print("[INFO] navigating to url...")
main_category_html = urlopen(BASE_URL + '/wiki/Category:Gun')
print("[INFO] parsing document...")
bs = BeautifulSoup(main_category_html, 'html.parser')
print("[INFO] finding all sub-category links...")
anchors = bs.find('div', id='mw-subcategories').find_all('a')
# Running total of images downloaded; incremented in the loop below.
counter = 0
for idx, anchor in enumerate(anchors, start=1):
    # Anchor titles look like "Category:<Name>" — keep only <Name>.
    title = anchor.get('title').split(':')[1]
    print(title)
    urls[title] = anchor.get('href')
    print("[INFO] grabbed link #" + str(idx))
# Walk every collected sub-category, then every article inside it, and
# download each .jpg image into datasets/<classification>/.
for title, url in urls.items():
    if title in IGNORE:
        print("[INFO] ignoring {}".format(title))
        continue
    # Resolve the training class for this sub-category once, up front.
    # BUG FIX: the original looked this up per-image and never reset the
    # variable, so an unmapped title either crashed (NameError) or
    # silently reused the previous iteration's classification.
    classification = None
    for class_name, titles in CLASSES.items():
        if title in titles:
            classification = class_name
            break
    if classification is None:
        print("[INFO] no classification for {}; skipping".format(title))
        continue
    # BUG FIX: create the output directory before writing; the original
    # open(..., "wb") raised FileNotFoundError when it did not exist.
    # os.path.join replaces os.path.sep.join, which mixed separators.
    out_dir = os.path.join("datasets", classification)
    os.makedirs(out_dir, exist_ok=True)
    sub_category_html = urlopen(BASE_URL + url)
    bs = BeautifulSoup(sub_category_html, 'html.parser')
    # Get all article links on the sub-category page.
    pages = bs.find('div', {"class": 'mw-content-ltr'}).find_all('a')
    for page in pages:
        page_html = urlopen(BASE_URL + page.get('href'))
        page_bs = BeautifulSoup(page_html, 'html.parser')
        # BUG FIX: escape the dot so only a literal ".jpg" matches.
        images = page_bs.find_all('img', {'src': re.compile(r'\.jpg')})
        for image in images:
            src = image['src']
            match = re.search(r'\.jpg', src)
            if match is None:
                # BUG FIX: the original dereferenced .end() without a
                # None check and would raise AttributeError here.
                continue
            # Rewrite the thumbnail path back to the full-size image URL.
            # NOTE(review): the slice offsets assume src starts with
            # '/images/thumb/<x>/' -- confirm against the live markup.
            image_url = BASE_URL + src[:7] + src[13:match.end()]
            try:
                r = requests.get(image_url, timeout=60)
                # Derive the file name from the URL's last path segment
                # (replaces a fragile regex that could IndexError).
                name = os.path.basename(image_url)
                p = os.path.join(out_dir, "{}.jpg".format(name))
                # with-block guarantees the handle is closed even if the
                # write fails (original leaked it on error).
                with open(p, "wb") as f:
                    f.write(r.content)
                counter += 1
                print("[INFO] Downloaded image {}. So far we have scraped {} images...".format(
                    name, counter))
            except requests.exceptions.RequestException as e:
                print("[INFO] error: {}".format(e))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment