- Python 3
- Pip 3
$ brew install python3
# http://docs.wand-py.org/en/0.5.9/
# http://www.imagemagick.org/script/formats.php
# brew install freetype imagemagick
# brew install PIL   (NOTE: PIL is normally installed from PyPI as Pillow: pip3 install Pillow)
# brew install tesseract
# pip3 install wand
# pip3 install pyocr
import pyocr.builders | |
import requests | |
from io import BytesIO |
# Download the zipped file for `file_id`, decode its first archive member as
# UTF-8 JSON, and append every entry of its "responses" list to json_buffer.
# `request_url`, `file_id` and `json_header` must already be defined.
gz_buffer = BytesIO()  # not used in this excerpt; presumably filled later
json_buffer = StringIO()
download_url = "{0}{1}/file".format(request_url, file_id)
request_download = requests.request("GET", download_url, headers=json_header, stream=True)
# Fail on an HTTP error status here, instead of later as a misleading
# BadZipFile when the error body is handed to zipfile.
request_download.raise_for_status()
with zipfile.ZipFile(BytesIO(request_download.content), mode='r') as z:
    # Only the first member of the archive is read.
    unzip_file = StringIO(z.read(z.infolist()[0]).decode('utf-8'))
json_responses = json.load(unzip_file)['responses']
for response in json_responses:
    # NOTE(review): entries are written back-to-back with no separator;
    # confirm the consumer expects concatenated JSON objects.
    json_buffer.write(json.dumps(response))
# Dict Comprehension:
# Map filename -> size for every entry of file_list whose name starts with 'solcon'.
process_dict = {attributes.filename: attributes.st_size for attributes in file_list if attributes.filename.startswith('solcon')}
# Whitespace Generous:
# Same result as the comprehension form, written as an explicit loop.
process_dict = {}  # start empty so this form stands on its own
for attributes in file_list:
    if attributes.filename.startswith('solcon'):
        process_dict[attributes.filename] = attributes.st_size
-- Session settings applied before the queries in this script.
-- Run on the MapReduce execution engine.
SET hive.execution.engine = mr;
-- Turn off Hive's concurrency (locking) support for this session.
SET hive.support.concurrency = false;
-- Allow independent stages of a query to execute in parallel.
SET hive.exec.parallel = true;
-- Permit inserts in which every partition column is dynamic.
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Unqualified table names below resolve against this database.
USE hosting_stats;
WITH Rank AS ( | |
SELECT | |
cid |
WITH Landing AS ( | |
SELECT | |
visit_id | |
,COLLECT_SET(shopper_id) AS shopper_array | |
,MIN(sequence) AS min_sequence | |
FROM | |
visits | |
WHERE | |
page_type = 'landing' | |
GROUP BY |
CHECK_HDFS="/some/path/to/file" | |
function hdfsCheck { | |
RETRY=0 | |
while [ $RETRY -lt 9 ]; | |
do | |
COUNT=$(hdfs dfs -ls "${CHECK_HDFS}" | wc -l) 2> stderr.txt | |
if [ $COUNT -lt 1 ]; then |
# Choose how long to wait before the next check.
# The hour is read once so the range test and the sleep arithmetic use the
# same value; calling `date` separately for each could straddle an hour
# boundary and produce a negative sleep duration.
# (`%-H` is the hour without zero padding; plain assignment is used instead
# of `local` because this excerpt does not show an enclosing function.)
CURRENT_HOUR=$(date +"%-H")
if [ "$CURRENT_HOUR" -ge 4 ] && [ "$CURRENT_HOUR" -le 17 ]; then
    # From hour 4 through hour 17: sleep 60 minutes per whole hour left until 17.
    # NOTE(review): during hour 17 this is `sleep 0m`, i.e. no wait - confirm intended.
    sleep "$(( (17 - CURRENT_HOUR) * 60 ))m"
else
    sleep 5m
fi