Skip to content

Instantly share code, notes, and snippets.

@mizvol
Created April 21, 2017 10:05
Show Gist options
  • Save mizvol/738095cae68b3297ec5010a3848e18a2 to your computer and use it in GitHub Desktop.
Save mizvol/738095cae68b3297ec5010a3848e18a2 to your computer and use it in GitHub Desktop.
import pymongo as pm
import unicodedata
# Connect to MongoDB with the driver defaults and select the `tags`
# collection of the `instagram` database.
client = pm.MongoClient()
db = client.instagram
tagsDB = db.tags


def _to_ascii(text):
    """Return *text* NFKD-normalized with every non-ASCII character dropped.

    The result is decoded back to a text string, so it can be compared with
    ordinary string literals and is not left as bytes on Python 3.
    """
    # 'ignore' silently discards whatever ASCII cannot represent, so a tag
    # made only of such characters comes back as an empty string.
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')


# Build (document id, ASCII tags) pairs, one per document in the collection.
# NOTE(review): assumes every document has a 'tags' field holding an iterable
# of text strings -- confirm against the collection contents.
tagsList = []
for tag in tagsDB.find():
    asciiTags = [_to_ascii(t) for t in tag['tags']]
    # Truthiness drops tags that became empty after ASCII folding. The former
    # `!= ''` test compared bytes with str, which is always True on Python 3,
    # so empty tags were never filtered out there.
    tagsList.append((str(tag['_id']), [t for t in asciiTags if t]))

# filter tags list - remove duplicated tags
# (going through a set does not preserve the original tag order)
filteredList = [(tagId, list(set(tokens))) for tagId, tokens in tagsList]

# create Spark Data Frame
# NOTE(review): `sc` is not defined in this script; presumably the
# SparkContext supplied by the pyspark shell/notebook -- confirm.
tagsListDF = sc.parallelize(filteredList).toDF(["id", "tokens"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment