Skip to content

Instantly share code, notes, and snippets.

@domarps
Created October 23, 2018 00:28
Show Gist options
  • Save domarps/5c5b5a86fc4ad72e2eb729f55c38161c to your computer and use it in GitHub Desktop.
Save domarps/5c5b5a86fc4ad72e2eb729f55c38161c to your computer and use it in GitHub Desktop.
import csv
import io
import json
import re
def extract_url(cid, id32, size='240'):
    """Build the image URL for a content id.

    The content id is left-padded with zeros to at least ten characters and
    the first four two-character groups of that string form the directory
    part of the URL.

    Args:
        cid: Content id (anything ``str.format`` can render, e.g. int or str).
        id32: Token placed after the content id in the file name.
        size: Size prefix of the file name. Defaults to ``'240'``, the value
            that was previously hard-coded.

    Returns:
        A ``(str(cid), url)`` tuple.
    """
    padded = '{:0>10}'.format(cid)
    # First four character pairs, e.g. '0123456789' -> '01/23/45/67'.
    chop_id = '/'.join(re.findall(r'..', padded)[:4])
    url = 'https://t3.ftcdn.net/jpg/{}/{}_F_{}_{}_NW.jpg'.format(
        chop_id, size, cid, id32)
    return str(cid), url
def extract_tags(rdd_record):
    """Parse one JSON record into its content id, image URL and tags.

    Args:
        rdd_record: JSON text of an object with top-level ``cid`` and
            ``id32`` keys and a list of strings under ``k`` -> ``eksrg``.

    Returns:
        A ``(cid, url, tags)`` tuple: ``cid`` and ``url`` come from
        ``extract_url``; ``tags`` is a list holding, for each ``eksrg``
        entry, the text before its first ``'^'``.

    Raises:
        ValueError: If ``rdd_record`` is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    record = json.loads(rdd_record)
    # Keep only the part of each entry that precedes the first '^'.
    tags = [entry.split('^')[0] for entry in record['k']['eksrg']]
    cid, url = extract_url(record['cid'], record['id32'])
    return cid, url, tags
# Turn every raw JSON record into its (cid, url, tags) tuple.
first_100_img_tags = first_100_rows.map(extract_tags)
def toCSVLine(data):
    """Render an iterable of fields as one CSV line without a line ending.

    Every field is converted with ``str``. Fields that contain a comma,
    a double quote or a line break are quoted, so a list-valued field
    (whose text form contains commas) stays in a single column instead of
    spilling into several.

    Args:
        data: Iterable of values forming one row.

    Returns:
        The CSV-encoded row as a string.
    """
    buffer = io.StringIO()
    # lineterminator='' because the caller writes one record per line itself.
    writer = csv.writer(buffer, lineterminator='')
    writer.writerow([str(field) for field in data])
    return buffer.getvalue()
# Render each (cid, url, tags) tuple as one line of comma-separated text.
lines = first_100_img_tags.map(toCSVLine)
# repartition(1) puts all lines into a single partition before saving.
# NOTE(review): presumably a PySpark RDD, in which case saveAsTextFile creates
# a directory at this path holding part files, not a single .csv file - confirm.
lines.repartition(1).saveAsTextFile("s3://psriniva/img_cid_url_tags_100.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment