Skip to content

Instantly share code, notes, and snippets.

@hn5092
Created April 10, 2017 09:14
Show Gist options
  • Save hn5092/7c84f33d395967a2cb33d9b6ba6342bb to your computer and use it in GitHub Desktop.
Save hn5092/7c84f33d395967a2cb33d9b6ba6342bb to your computer and use it in GitHub Desktop.
hive python udf
#coding=utf-8
import logging
import traceback
import jieba
import jieba.analyse
import redis
import sys
import os
import json
import codecs
# Usage from Hive (ship this script to the cluster, then stream rows through it):
#   ADD FILE /home/admin/chenyun/tmp/udf_jieba.py;
#   select TRANSFORM (feature) using 'python udf_jieba.py' as segment
#   from tablename where pt='20160329' limit 10;
# Python 2 only: make UTF-8 the interpreter-wide default encoding so that
# implicit str/unicode conversions of non-ASCII text do not go through the
# ASCII codec. ``reload(sys)`` is required first because Python 2's site
# initialisation removes ``sys.setdefaultencoding`` at startup.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except (NameError, AttributeError):
    # Python 3: ``reload`` is not a builtin (NameError) and
    # ``sys.setdefaultencoding`` does not exist (AttributeError); the default
    # encoding is already UTF-8 there, so there is nothing to do. Any other
    # exception is unexpected and is allowed to propagate.
    pass
# Load the custom dictionary kept in the Redis list "cy_dict_my" into jieba.
# Each list entry is split on "," and read as "word,frequency"; any fields
# after the second are ignored.
pool = redis.ConnectionPool(host='hostname', port=9000)
r = redis.StrictRedis(connection_pool=pool)
for entry in r.lrange("cy_dict_my", 0, -1):
    fields = entry.split(",")
    word = fields[0]
    if not word:
        # Blank entry: nothing to register.
        continue
    # An entry with no (or an empty) frequency field used to abort the whole
    # script with IndexError/ValueError. Passing None instead lets jieba pick
    # a frequency itself, which is add_word's documented default.
    # A non-numeric frequency still raises ValueError inside jieba.
    freq = fields[1] if len(fields) > 1 and fields[1] else None
    jieba.add_word(word, freq)

# Switch jieba to its parallel segmentation mode; per jieba's documentation
# the argument is the number of worker processes.
jieba.enable_parallel(8)
# Main loop: read rows from stdin (as streamed by Hive TRANSFORM), segment
# each one with jieba, and print one token per output line. Any failure is
# recorded in /tmp/myhive.log, which is truncated at the start of every run.
with codecs.open('/tmp/myhive.log', 'w') as log:
    try:
        for line in sys.stdin:
            # NOTE(review): ``line`` still carries its trailing newline;
            # presumably jieba yields it as a token of its own, which would
            # print blank output lines -- confirm whether that is intended.
            # Iterate the generator directly; no intermediate list is needed.
            for word in jieba.cut(line):
                # Parenthesised form is identical to ``print word`` on
                # Python 2 for a single argument and also parses on Python 3.
                print(word)
    except Exception as e:
        # Record the message, the full traceback (previously lost) and
        # sys.path, each newline-terminated so the log stays readable.
        log.write(str(e) + "\n")
        log.write(traceback.format_exc())
        log.write(str(sys.path) + "\n")
        # The exception is not re-raised, so the process still exits normally
        # after logging. NOTE(review): consider re-raising so Hive sees a
        # failed task instead of silently truncated output.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment