Skip to content

Instantly share code, notes, and snippets.

@kenttw
Last active April 7, 2022 05:55
Show Gist options
  • Save kenttw/a2d7e2bf1e6fcf14f532 to your computer and use it in GitHub Desktop.
使用 Spark 來分析 Training 資料與待預測資料分佈狀況
# URL parsing: Python 2 module name first, Python 3 fallback for forward compat.
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse

# Load the previously trained gender classifier for this dataset.
# NOTE(review): unpickling is only safe on trusted input -- this model file is
# presumably produced by our own pipeline; confirm it cannot be user-supplied.
# Fix: the original `pickle.loads(open(path).read())` leaked the file handle;
# use a context manager and pickle.load, reading in binary mode.
with open(settings.DATA_FOLDER + id + "/GenderClassify.pkl", "rb") as _model_file:
    genc = pickle.load(_model_file)
def raw2feature(line):
    """Parse one CSV log line into a list of (feature_name, 1) pairs.

    Expected columns, in order: hour, category_id, cookie_pta, timestamp,
    url, country, city, resolution, browser, browser_version, os,
    os_version, device_model, device_marketing, device_brand,
    search_keyword, referrer_host.

    Returns a list of (feature, 1) tuples suitable for reduceByKey.
    Malformed lines (too few columns, unparsable content) yield an empty
    list so a single bad record cannot fail the whole Spark job.
    """
    fields = ["hour", "category_id", "cookie_pta", "timestamp", "url",
              "country", "city", "resolution", "browser", "browser_version",
              "os", "os_version", "device_model", "device_marketing",
              "device_brand", "search_keyword", "referrer_host"]
    features = []
    try:
        values = line.strip().split(',')
        if len(values) < len(fields):
            # The original code hit an IndexError here and returned [];
            # keep that contract explicit for short/malformed lines.
            return features
        # Extra trailing columns are ignored, as in the original loop.
        parsedline = dict(zip(fields, values))
        parsed = urlparse(parsedline['url'])
        features.append(("author_" + parsed.netloc, 1))
        features.append(("article_" + parsed.path, 1))
        features.append(("hour_" + parsedline['hour'], 1))
        features.append(("cat_" + parsedline['category_id'], 1))
        features.append(("refer_" + parsedline['referrer_host'], 1))
        features.append(("country_" + parsedline['country'], 1))
    except Exception:
        # Best-effort: deliberately skip unparsable lines, but no longer
        # swallow SystemExit/KeyboardInterrupt like the bare `except:` did.
        pass
    return features
fpath = settings.DATA_FOLDER + id +'/predict/*.csv???'
lines = sc.textFile(fpath.replace("s3", "s3n"))
pairs = lines.flatMap (raw2feature)
result = pairs.reduceByKey(lambda a, b: a + b)
allmap = result.collectAsMap()
fset =genc.vecz.feature_names_
interskeys = set(allmap.keys()) & set(fset)
print "Training 資料中有 " + str(len(fset)) + " feature:"
print "待預測資料中有 " + str(len(allmap)) + " feature"
print "交集中有 " + str(len(interskeys)) + " feature"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment