Skip to content

Instantly share code, notes, and snippets.

@tomtoump
Last active December 15, 2016 22:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tomtoump/6597732eff96dcf780fa118655e68a53 to your computer and use it in GitHub Desktop.
Save tomtoump/6597732eff96dcf780fa118655e68a53 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from bs4 import BeautifulSoup
from Queue import Queue
from threading import Thread
from urlparse import urlparse
import configargparse
import fnmatch
import os
import oss2
import requests
import sys
def _upload_file(key, data):
    """Upload ``data`` to the bucket under ``key`` and report the outcome.

    Each status message is emitted with one ``sys.stdout.write`` call
    (newline included).

    Args:
        key: Object key to store the payload under.
        data: Payload handed unchanged to ``bucket.put_object``.

    Returns:
        True when ``put_object`` completed, False when it raised. The
        exception is reported on stdout and not re-raised (best effort).
    """
    sys.stdout.write("Uploading '%s'" % key + '\n')
    try:
        # ``bucket`` is the module-level object created in the script's
        # ``__main__`` section. Only this call is guarded.
        bucket.put_object(key, data)
    except Exception as err:
        sys.stdout.write("Error while uploading '%s': %s" % (key, err) + '\n')
        return False
    sys.stdout.write("Successfully uploaded '%s'" % key + '\n')
    return True
def _tag_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' if unavailable.

    ``tag`` may be None (element not found). ``tag.string`` may also be None
    (empty element or nested markup); both cases yield ''.
    """
    if tag is None or tag.string is None:
        return ''
    return tag.string.strip()


def upload_files(path):
    """Walk ``path`` for ``*.html`` files and queue their images for upload.

    For every HTML file, the text of the element with id ``productImage`` is
    taken as the source image URL and the text of the element with id
    ``random`` as the object name. Files missing either value are skipped,
    as are files whose URL already equals the target bucket URL. Every other
    file is put on the module-level queue ``q`` as a
    ``(url, key, file_path, final_url)`` tuple.

    Args:
        path: Directory to scan. The process working directory is changed to
            it, so queued file paths are relative to ``path``.
    """
    os.chdir(path)
    for root, _dirs, files in os.walk(os.curdir):
        for file_name in fnmatch.filter(files, '*.html'):
            file_path = os.path.join(root, file_name)
            # ``with`` closes the handle even when parsing fails.
            with open(file_path) as file_obj:
                soup = BeautifulSoup(file_obj, 'lxml')

            url = _tag_text(soup.find(id='productImage'))
            name = _tag_text(soup.find(id='random'))
            if not url or not name:
                sys.stdout.write(
                    "File '%s' doesn't contain a url and name -> Skip" % file_path + '\n')
                continue

            # Object key: prefix + name + the extension of the source image.
            url_path = urlparse(url).path
            ext = os.path.splitext(url_path)[1]
            key = "%s%s%s" % (options.bucket_prefix, name, ext)
            final_url = 'http://%s.%s/%s' % (options.bucket, options.endpoint, key)

            # A file that already points at the bucket was handled earlier.
            if url == final_url:
                sys.stdout.write(
                    "File '%s' already processed -> Skip" % file_path + '\n')
                continue

            q.put((url, key, file_path, final_url))
def worker():
    """Process jobs from the module-level queue ``q`` forever.

    Each job is a ``(url, key, file_path, final_url)`` tuple handed to
    ``do_work``. ``q.task_done()`` is called for every job, including ones
    that fail. Otherwise a single unexpected exception would leave the job
    unacknowledged and block ``q.join()`` in the main thread indefinitely.
    """
    while True:
        url, key, file_path, final_url = q.get()
        try:
            do_work(url, key, file_path, final_url)
        except Exception as err:
            # Report and keep the thread alive so remaining jobs are consumed.
            sys.stdout.write(
                "Error while processing '%s': %s" % (file_path, err) + '\n')
        finally:
            q.task_done()
def do_work(url, key, file_path, final_url, timeout=30):
    """Download one image, upload it, and repoint the HTML file at the copy.

    Args:
        url: Source image URL found in the HTML file.
        key: Object key to upload the image under.
        file_path: Path of the HTML file that references ``url``.
        final_url: URL that replaces ``url`` in the file after the upload.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot hang the worker forever.

    Returns:
        True when the image was uploaded and the file rewritten, False when
        the download or the upload failed (the file is left untouched).
    """
    # Download image. ``raise_for_status`` turns 4xx/5xx answers into a
    # RequestException so an error page is never uploaded as the image.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        sys.stdout.write("File '%s' doesn't contain a reachable url" % file_path + '\n')
        return False
    image = response.content

    # Upload image.
    # NOTE(review): this message is printed unconditionally; it looks like a
    # leftover from a removed "already in bucket?" check - confirm intent.
    sys.stdout.write("File '%s' doesn't exist" % key + '\n')
    if not _upload_file(key, image):
        return False

    # Rewrite the reference. Binary I/O plus explicit UTF-8 decoding and
    # encoding leaves every other byte of the file unchanged.
    with open(file_path, 'rb') as html_file:
        html = html_file.read().decode('utf-8')
    html = html.replace(url, final_url)
    with open(file_path, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
    return True
if __name__ == '__main__':
    # Command-line interface. ``options``, ``bucket`` and ``q`` are
    # module-level names read by the functions defined in this file.
    p = configargparse.ArgParser()
    p.add('-b', '--bucket', required=True, help='bucket name', metavar='')
    p.add('-bp', '--bucket-prefix', default='', help='bucket prefix', metavar='')
    p.add('-e', '--endpoint', required=True, help='api endpoint', metavar='')
    p.add('-p', '--path', required=True, help='local path', metavar='')
    p.add('-t', '--threads', type=int, default=10, help='number of threads', metavar='')
    options = p.parse_args()
    # Without at least one worker nothing drains the queue and ``q.join()``
    # below would never return.
    if options.threads < 1:
        p.error('--threads must be at least 1')

    # Placeholder credentials: replace with real values before running.
    access_key = 'ACCESS_KEY'
    secret_key = 'SECRET_KEY'
    try:
        auth = oss2.Auth(access_key, secret_key)
        bucket = oss2.Bucket(auth, options.endpoint, options.bucket)
        q = Queue()
        # Daemon threads: the workers loop forever, so they must not keep the
        # process alive once the queue has been drained.
        for _ in range(options.threads):
            t = Thread(target=worker)
            t.daemon = True
            t.start()
        upload_files(options.path)
        # Block until every queued job has been acknowledged by a worker.
        q.join()
    except Exception as err:
        # Report on stderr and signal failure through the exit status.
        sys.stderr.write('%s\n' % err)
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment