Skip to content

Instantly share code, notes, and snippets.

@joest67
Created May 23, 2016 14:31
Show Gist options
  • Save joest67/d4962b81176b8fc7269cf39de9552a64 to your computer and use it in GitHub Desktop.
Save joest67/d4962b81176b8fc7269cf39de9552a64 to your computer and use it in GitHub Desktop.
We can make this file beautiful and searchable if this error is corrected: It looks like row 3 should actually have 8 columns, instead of 7. in line 2.
430972,5037,6,52,2600.0,0,用,c
352986,4498,5,12,600.0,0,对,-
1399015,26982,569,12,600.0,0,-
678560,2954497,11,9,450.0,1,用,扫
331,18779,2,9,450.0,0,用,地址,-,win
135200,3568182,2,6,300.0,1,端口,路由,转发
992422,21469,36,6,300.0,0,脚本,存,window,调用
142786,26725,3,6,300.0,0,网,地址,-
281,18243,2,5,250.0,0,屏蔽,端口,路由,路由器
701894,22114,12,4,200.0,0,用
175459,2953509,3,3,150.0,1,扫,端口,过
353306,22192,5,2,100.0,0,对
308691,3140498,4,2,100.0,1,地址,ok
867063,3573899,20,1,50.0,1,肉鸡
238,17821,2,1,50.0,0,会
41083,2943788,2,1,50.0,1,放
431330,24388,6,1,50.0,0,用户
429021,3578509,5,1,50.0,1,入侵
1409102,3569938,619,1,50.0,1,防护
215532,3185471,3,1,50.0,1,点
761486,3152196,14,0,0.0,1
1183309,9020,110,0,0.0,0
562406,2964500,8,0,0.0,1
1091680,14378,61,0,0.0,0
841050,10781,19,0,0.0,0
1167932,9006,99,0,0.0,0
431300,23607,6,0,0.0,0
967507,25445,32,0,0.0,0
333992,3416846,4,0,0.0,1
431306,23834,6,0,0.0,0
1369255,26982,443,0,0.0,0
91997,3183133,2,0,0.0,1
841013,5099,19,0,0.0,0
1052881,24048,49,0,0.0,0
1396025,26982,555,0,0.0,0
1003482,5099,38,0,0.0,0
1060222,20082,51,0,0.0,0
1298224,9006,254,0,0.0,0
142589,23834,3,0,0.0,0
596679,7937,9,0,0.0,0
1184613,9006,111,0,0.0,0
773607,26932,15,0,0.0,0
1070490,13938,54,0,0.0,0
752259,8017,14,0,0.0,0
142486,22262,3,0,0.0,0
953309,8017,30,0,0.0,0
1252167,26982,180,0,0.0,0
259016,28402,4,0,0.0,0
792768,21243,16,0,0.0,0
792785,23153,16,0,0.0,0
986516,9202,35,0,0.0,0
1293365,26692,245,0,0.0,0
192035,3050687,3,0,0.0,1
1036872,8863,45,0,0.0,0
1097117,7937,63,0,0.0,0
550381,8095,8,0,0.0,0
518078,3086023,7,0,0.0,1
416,20281,2,0,0.0,0
752409,27074,14,0,0.0,0
582093,3306162,8,0,0.0,1
431098,17350,6,0,0.0,0
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
usage:
python sample.py ddos_ch_200.csv hackbase_sample0.csv 123.csv
"""
import csv
import argparse
from collections import (
defaultdict,
namedtuple
)
import jieba
# Row layout of the raw input CSV: one post/floor together with the
# sentence text to score and a manual relevance flag.
SourceFrame = namedtuple('SourceFrame',
                         'index_key postid floor sentence manual_related')
# Row layout of a scored result: original post identity plus the hit
# frequency, weighted score and the list of matched keywords.
DataFrame = namedtuple('DataFrame',
                       'postid floor freq score manual keywords')
class KeyWordFrequentClient(object):
    """Score CSV rows of posts by weighted keyword frequency.

    Loads a sample CSV of ``keyword,weight,count`` rows, cuts each test
    sentence with jieba, sums the weights of the sample keywords the
    sentence hits, and writes the rows sorted by score (descending) to a
    result CSV.
    """

    # Encodings tried, in order, when decoding raw byte strings.
    encodings = ['utf-8', 'gbk']

    def __init__(self, sample_file, test_filepath, result_filepath,
                 strict_mode=True):
        """
        :param sample_file: path of the keyword/weight sample CSV
        :param test_filepath: path of the CSV with sentences to score
        :param result_filepath: path the scored CSV is written to
        :param strict_mode: if True a keyword must equal a cut token
            exactly; otherwise a token only needs to be contained in it
        """
        self.sample_file = sample_file
        self.test_filepath = test_filepath
        self.result_filepath = result_filepath
        self.strict_mode = strict_mode

    def guess_encoding(self, word):
        """Return the first encoding in ``self.encodings`` that decodes
        *word*, or ``None`` when none of them can.

        Fix: the original used a bare ``except: pass`` (hiding any
        error, not just decode failures) and fell off the loop returning
        an implicit None that later crashed ``word.decode(None)``.
        """
        for encoding in self.encodings:
            try:
                word.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                continue
            return encoding
        return None

    def init_sample_data(self, sample_input_file):
        """Parse the sample CSV into a ``{keyword: weight}`` mapping.

        Expected row layout: keyword, weight, count (count is unused).
        Blank or too-short rows are skipped instead of raising, and raw
        byte strings (Py2 csv output) are decoded via guess_encoding(),
        falling back to utf-8 with replacement instead of crashing.
        """
        keyword_score_map = defaultdict(int)
        for row in csv.reader(sample_input_file):
            if len(row) < 2:
                # blank line or malformed row: skip rather than unpack-crash
                continue
            word, weight = row[0], row[1]
            if isinstance(word, bytes):
                # Under Py2, csv yields raw bytes (the old basestring
                # check matched the same rows); Py3 csv yields str and
                # skips this branch entirely.
                encoding = self.guess_encoding(word)
                word = word.decode(encoding or 'utf-8', 'replace')
            keyword_score_map[word] = float(weight)
        return keyword_score_map

    def hit_keywords(self, keyword, candidates):
        """Return the list of *candidates* matching *keyword*.

        Strict mode: a candidate matches only when it equals the keyword
        exactly; loose mode: when it is a substring of the keyword.

        Fix: return a real list. Py2 ``filter`` returned one, but under
        Py3 it is a lazy iterator, which would break the caller's
        ``len()`` and ``set()`` usage.
        """
        if self.strict_mode:
            return [c for c in candidates if c == keyword]
        return [c for c in candidates if c in keyword]

    def calculate_sentence_score(self, sentence, keyword_score_map):
        """Cut *sentence* with jieba and accumulate hit statistics.

        :param sentence: Chinese text to be segmented
        :param keyword_score_map: ``{keyword: weight}`` from the sample
        :returns: ``(freq, score, hit_keywords)`` — total hit count,
            weighted score, and the distinct matched tokens as a list
        """
        parts = list(jieba.cut_for_search(sentence))  # only Chinese sentence
        score = 0
        freq = 0
        hit_keywords = set()
        # .items() replaces Py2-only .iteritems()
        for keyword, weight in keyword_score_map.items():
            hit_parts = self.hit_keywords(keyword, parts)
            freq += len(hit_parts)
            score += len(hit_parts) * weight
            hit_keywords |= set(hit_parts)
        return freq, score, list(hit_keywords)

    def process_data(self, keyword_score_map, input_file):
        """Score every well-formed row of the test CSV.

        :returns: ``{index_key: DataFrame}``. Rows with too few columns
            are skipped; extra trailing columns are ignored (the
            original passed the whole row to ``SourceFrame(*line)``,
            which raised TypeError on rows with more than five columns).
        """
        res = {}
        n_fields = len(SourceFrame._fields)
        for line in csv.reader(input_file):
            if len(line) < n_fields:
                continue
            source_frame = SourceFrame(*line[:n_fields])
            freq, score, hit_keywords = self.calculate_sentence_score(
                source_frame.sentence,
                keyword_score_map
            )
            res[source_frame.index_key] = DataFrame(
                source_frame.postid, source_frame.floor, freq, score,
                source_frame.manual_related, hit_keywords,
            )
        return res

    def write_result(self, res):
        """Write scored rows to ``self.result_filepath``.

        Output column order: index_key, postid, floor, freq, score,
        manual, then one column per hit keyword. Rows are sorted by
        score, descending.
        """
        # sort by score default
        sorted_line_map = sorted(res.items(),
                                 key=lambda kv: kv[1].score, reverse=True)
        # csv.writer on Py3 needs a text file opened with newline=''
        # (the old 'wb' mode rejects the str rows csv produces).
        with open(self.result_filepath, 'w', newline='') as f:
            csv_writer = csv.writer(f)
            for idx, frame in sorted_line_map:
                # keywords are already text; the old per-item
                # .encode('utf-8') would write b'...' reprs under Py3
                csv_writer.writerow(
                    [idx] + list(frame[:-1]) + list(frame[-1]))

    def run(self):
        """End-to-end pipeline: load the sample keywords, score the
        test file, write the sorted result CSV."""
        with open(self.sample_file) as sample_input:
            keyword_score_map = self.init_sample_data(sample_input)
        with open(self.test_filepath) as input_file:
            res = self.process_data(keyword_score_map, input_file)
        self.write_result(res)
def main(argv=None):
    """CLI entry point: parse arguments and run the scoring client.

    :param argv: optional argument list; defaults to ``sys.argv[1:]``
        (parameter added, backward-compatibly, so the entry point is
        callable from tests and other code).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('sample_file', help='sample file path')
    parser.add_argument('test_file', help='test file path')
    # fix: help text was a copy-paste of the test-file help
    parser.add_argument('save_filepath', help='result file path')
    # fix: the original used action='store_false' with default=False,
    # so passing -s could never actually enable strict mode
    parser.add_argument('-s', '--strict-mode', action='store_true',
                        default=False, help='exact equal with sample keyword')
    args = parser.parse_args(argv)
    print(args)  # Py3 function-call form; print statements were Py2-only
    client = KeyWordFrequentClient(
        args.sample_file, args.test_file, args.save_filepath,
        args.strict_mode
    )
    client.run()
    print('done')


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment