Skip to content

Instantly share code, notes, and snippets.

@lxxstc
Last active December 14, 2015 01:09
Show Gist options
  • Save lxxstc/5004259 to your computer and use it in GitHub Desktop.
Save lxxstc/5004259 to your computer and use it in GitHub Desktop.
Hive Job With Python Script
对AV数据进行去重操作.
HQL中使用了 DISTRIBUTE BY 和 SORT BY
reduce阶段使用了python脚本进行去重操作
ADD FILE reduce.py;

-- Append de-duplicated AV rows to av_direct_uniq, partition dt=2013-02-21.
-- The ADD FILE statement above makes reduce.py available to the job so that
-- the REDUCE clause can stream rows through it.
-- DISTRIBUTE BY routes every row with the same code and dep_date to the same
-- reducer, and SORT BY makes those rows adjacent within that reducer.
-- reduce.py receives the ten columns tab-separated, in the order listed in
-- the REDUCE clause.
FROM (
    SELECT
        dep_airport,
        arr_airport,
        dep_time,
        arr_time,
        dep_date,
        arr_date,
        act_code,
        act_carrier,
        carrier,
        code
    FROM (
        SELECT
            dep_airport,
            arr_airport,
            dep_time,
            arr_time,
            -- Day numbers derived from the raw timestamps. An earlier variant
            -- built midnight timestamps via to_date and unix_timestamp instead.
            -- NOTE(review): assuming dep_time and arr_time are Unix epoch
            -- seconds, round picks the NEAREST day number, so the day boundary
            -- sits at 12:00 UTC rather than 00:00 UTC. Confirm this is intended.
            round(dep_time / 86400) AS dep_date,
            round(arr_time / 86400) AS arr_date,
            -- The array columns are reduced to their first element. The filter
            -- below guarantees act_code holds exactly one element.
            act_code[0]    AS act_code,
            act_carrier[0] AS act_carrier,
            carrier[0]     AS carrier,
            code[0]        AS code
        FROM av
        WHERE
            -- 1361289600 is 2013-02-19 16:00:00 UTC, which is 2013-02-20 00:00
            -- at UTC+8.
            dep_time > 1361289600
            AND size(act_code) = 1
    ) a
    DISTRIBUTE BY
        code,
        dep_date
    SORT BY
        code,
        dep_date DESC
) av_table
INSERT INTO TABLE av_direct_uniq PARTITION (dt = '2013-02-21')
REDUCE
    av_table.dep_airport,
    av_table.arr_airport,
    av_table.dep_time,
    av_table.arr_time,
    av_table.dep_date,
    av_table.arr_date,
    av_table.act_code,
    av_table.act_carrier,
    av_table.carrier,
    av_table.code
USING 'reduce.py'
AS
    dep_airport,
    arr_airport,
    dep_time,
    arr_time,
    dep_date,
    arr_date,
    act_code,
    act_carrier,
    carrier,
    code;
#!/usr/bin/env python
"""Streaming reducer that drops consecutive rows sharing the same key.

Reads tab-separated rows from stdin and writes to stdout the first row of
every run of consecutive rows whose 5th field (dep_date) and last field
(code) are both equal. Rows with the same key that are NOT adjacent in the
input are not detected, so the input must already be grouped by that key.

Works under both Python 2 and Python 3.
"""
import sys


def dedupe(lines):
    """Yield the first line of each run of consecutive lines with equal keys.

    Args:
        lines: iterable of tab-separated text lines, with or without a
            trailing newline.

    Yields:
        Each kept line with its trailing newline removed and every field,
        including empty leading or trailing ones, left untouched.

    Raises:
        IndexError: if a line has fewer than five tab-separated fields.
    """
    # None can never equal a real key, so the first row is always emitted
    # even when both key fields are empty strings.
    last_key = None
    for raw in lines:
        # Remove only the line terminator. A plain strip() would also eat
        # leading/trailing tabs and shift the column positions whenever the
        # first or last field is empty.
        row = raw.rstrip('\n')
        fields = row.split('\t')
        # A tuple keeps the two parts distinct. Plain concatenation would
        # treat ("1", "23") and ("12", "3") as the same key.
        key = (fields[4], fields[-1])
        if key == last_key:
            continue
        last_key = key
        yield row


def main():
    """Filter stdin to stdout, one kept row per line."""
    for row in dedupe(sys.stdin):
        sys.stdout.write(row + '\n')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment