-
-
Save tomtoump/6597732eff96dcf780fa118655e68a53 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from bs4 import BeautifulSoup | |
from Queue import Queue | |
from threading import Thread | |
from urlparse import urlparse | |
import configargparse | |
import fnmatch | |
import os | |
import oss2 | |
import requests | |
import sys | |
def _upload_file(key, data):
    """Upload ``data`` to the OSS bucket under ``key``.

    Uses the module-level ``bucket`` object created by the script entry
    point. Progress and failures are reported on stdout.

    Args:
        key: Object key to store the content under.
        data: Content to upload.

    Returns:
        True if the upload succeeded, False if any error was raised.
    """
    sys.stdout.write("Uploading '%s'\n" % key)
    try:
        bucket.put_object(key, data)
    except Exception as err:
        # Deliberately broad: one failed upload is reported and skipped
        # instead of killing the calling worker thread.
        sys.stdout.write("Error while uploading '%s': %s\n" % (key, err))
        return False
    sys.stdout.write("Successfully uploaded '%s'\n" % key)
    return True
def _tag_text(tag):
    """Return the stripped text of a parsed tag, or '' if it has none."""
    # ``tag.string`` is None when the tag does not hold exactly one text node.
    text = tag.string if tag is not None else None
    return text.strip() if text else ''


def upload_files(path):
    """Scan ``path`` for ``*.html`` files and queue their images for upload.

    Each HTML file is parsed with BeautifulSoup. The text of the element
    with id ``productImage`` is used as the source image url and the text of
    the element with id ``random`` as the object name. For every file that
    has both and has not been processed yet, a
    ``(url, key, file_path, final_url)`` tuple is put on the module-level
    queue ``q``.

    Side effect: the process working directory is changed to ``path``, so
    the queued ``file_path`` values are relative to it.

    Args:
        path: Directory to walk recursively.
    """
    os.chdir(path)

    # Accept endpoints given with or without a scheme; the public url below
    # needs the bare host name only.
    endpoint_host = urlparse(options.endpoint).netloc or options.endpoint
    endpoint_host = endpoint_host.strip('/')

    for root, _dirs, files in os.walk(os.curdir):
        for file_name in fnmatch.filter(files, '*.html'):
            file_path = os.path.join(root, file_name)
            # Binary mode: let BeautifulSoup detect the document encoding.
            with open(file_path, 'rb') as file_obj:
                soup = BeautifulSoup(file_obj, 'lxml')

            # Get url and name
            url = _tag_text(soup.find(id='productImage'))
            name = _tag_text(soup.find(id='random'))
            if not url or not name:
                sys.stdout.write(
                    "File '%s' doesn't contain a url and name -> Skip\n"
                    % file_path)
                continue

            # Create key: prefix + name + extension of the source image
            ext = os.path.splitext(urlparse(url).path)[1]
            key = "%s%s%s" % (options.bucket_prefix, name, ext)
            final_url = 'http://%s.%s/%s' % (
                options.bucket, endpoint_host, key)

            # A file whose url already points at the bucket was handled by
            # an earlier run.
            if url == final_url:
                sys.stdout.write(
                    "File '%s' already processed -> Skip\n" % file_path)
                continue

            q.put((url, key, file_path, final_url))
def worker():
    """Consume jobs from the module-level queue ``q`` forever.

    Each job is a ``(url, key, file_path, final_url)`` tuple that is handed
    to ``do_work``. The loop never returns; the script starts it in daemon
    threads.
    """
    while True:
        job = q.get()
        try:
            do_work(*job)
        except Exception as err:
            # Keep the thread alive: a single bad file must not stop the
            # remaining jobs from being processed.
            sys.stdout.write(
                "Error while processing '%s': %s\n" % (job[2], err))
        finally:
            # Always acknowledge the job, otherwise q.join() in the main
            # thread would block forever after a failure.
            q.task_done()
def do_work(url, key, file_path, final_url, timeout=30):
    """Download one image, upload it to the bucket and rewrite its HTML file.

    Args:
        url: Source url of the image.
        key: Object key to upload the image under.
        file_path: HTML file in which ``url`` is replaced.
        final_url: Url that replaces ``url`` in the HTML file.
        timeout: Seconds to wait for the download before giving up.

    Returns:
        True if the image was uploaded and the file rewritten, False if the
        download or the upload failed (the file is left untouched then).
    """
    # Download image
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as failures so that an error page is
        # never uploaded in place of the image.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        sys.stdout.write(
            "File '%s' doesn't contain a reachable url\n" % file_path)
        return False
    image = response.content

    # Upload image
    # NOTE(review): this message is emitted unconditionally; no existence
    # check is performed here - confirm whether it is still wanted.
    sys.stdout.write("File '%s' doesn't exist\n" % key)
    if not _upload_file(key, image):
        return False

    # Point the HTML file at the uploaded copy. The file is handled as
    # UTF-8 encoded bytes so the replacement is done on decoded text.
    with open(file_path, 'rb') as html_file:
        html = html_file.read().decode('utf-8')
    html = html.replace(url, final_url)
    with open(file_path, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
    return True
if __name__ == '__main__':
    # Command-line interface. ``options``, ``bucket`` and ``q`` are module
    # globals read by the functions defined above.
    p = configargparse.ArgParser()
    p.add('-b', '--bucket', required=True, help='bucket name', metavar='')
    p.add('-bp', '--bucket-prefix', default='', help='bucket prefix', metavar='')
    p.add('-e', '--endpoint', required=True, help='api endpoint', metavar='')
    p.add('-p', '--path', required=True, help='local path', metavar='')
    p.add('-t', '--threads', type=int, default=10, help='number of threads', metavar='')
    options = p.parse_args()
    if options.threads < 1:
        # Without at least one worker nothing consumes the queue and
        # q.join() below would never return.
        p.error('--threads must be at least 1')

    # Credentials can be supplied through the environment so that they do
    # not have to be written into this file; the placeholders remain the
    # fallback.
    access_key = os.environ.get('OSS_ACCESS_KEY_ID', 'ACCESS_KEY')
    secret_key = os.environ.get('OSS_ACCESS_KEY_SECRET', 'SECRET_KEY')

    try:
        auth = oss2.Auth(access_key, secret_key)
        bucket = oss2.Bucket(auth, options.endpoint, options.bucket)
        q = Queue()
        # Daemon threads: they loop forever and must not keep the process
        # alive once the queue has been drained.
        for _ in range(options.threads):
            t = Thread(target=worker)
            t.daemon = True
            t.start()
        upload_files(options.path)
        # Wait until every queued file has been processed.
        q.join()
    except Exception as err:
        # Top-level boundary: report the failure and signal it to the caller.
        sys.stderr.write('%s\n' % err)
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment