Created
January 15, 2018 07:53
-
-
Save gt11799/db3257efa9ee458488002f262c6e2fe2 to your computer and use it in GitHub Desktop.
用于任务相似度的分析
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!coding:utf8 | |
import logging | |
from sqlalchemy import func | |
from collections import defaultdict | |
from itertools import combinations | |
from time import time | |
from datetime import datetime | |
from text_detector import check_similary_cos, remove_punctuation | |
from falcon.corelibs.store import db | |
from falcon.models.task.task import Task | |
from falcon.models.task.task_similary import TaskSimilary, TASK_SIMILARY_BIZ_TYPE | |
from falcon.wsgi import application as app | |
# Number of rows fetched per page when tasks are read from the database.
BATCH = 1000

# Minimum pairwise similarity a pair must reach for its tasks to be reported.
# 0.0 means every compared pair qualifies.
SIMILAR_RATE = 0.0
def load_tasks_from_csv():
    """Load tasks from ``tasks.csv`` in the current working directory.

    Every line is expected to hold exactly five comma-separated fields, in
    this order: ``task_no, house_code, mobile, content, updated``.  Lines
    with any other number of fields (blank lines included) are skipped, so
    the import stays best-effort.

    Returns:
        list of dict: one dict per well-formed line, keyed by ``task_no``,
        ``house_code``, ``mobile``, ``content`` and ``updated``.  All values
        are the raw strings read from the file.
    """
    ret = []
    with open("tasks.csv", "r") as f:
        # Iterating the file object reads it line by line.
        for line in f:
            # Drop the line terminator so it does not end up in ``updated``.
            fields = line.rstrip("\r\n").split(",")
            if len(fields) != 5:
                # Malformed row: skip it instead of hiding *every* possible
                # error behind a bare ``except``.
                continue
            task_no, house_code, mobile, content, updated = fields
            ret.append(dict(task_no=task_no, house_code=house_code,
                            mobile=mobile, content=content, updated=updated))
    return ret
def load_tasks(start=None, end=None, biz_type=TASK_SIMILARY_BIZ_TYPE):
    """Load tasks whose ``updated`` timestamp falls inside a time window.

    Args:
        start: inclusive lower bound for ``Task.updated``.  ``None`` means
            no lower bound, unless ``end`` is ``None`` too (see below).
        end: inclusive upper bound for ``Task.updated``.  ``None`` means no
            upper bound, unless ``start`` is ``None`` too (see below).
        biz_type: when truthy, only tasks with this ``business_type`` are
            loaded.

    When both ``start`` and ``end`` are ``None`` the window defaults to
    00:00 on the 21st of the previous month through 00:00 on the 20th of
    the current month.

    Returns:
        list of dict: one dict per task with the keys ``task_no``,
        ``house_code``, ``mobile``, ``content`` and ``updated``.
        ``house_code`` falls back to the task's ``project_code`` when it is
        NULL, and ``updated`` falls back to ``created`` when it is falsy.
    """
    if start is None and end is None:
        now = datetime.now()
        # Zero the microseconds as well so the boundaries are exact midnights.
        midnight = dict(hour=0, minute=0, second=0, microsecond=0)
        if now.month != 1:
            start = now.replace(month=now.month - 1, day=21, **midnight)
        else:
            # January: the previous month is December of the previous year.
            start = now.replace(
                year=now.year - 1, month=12, day=21, **midnight)
        end = now.replace(day=20, **midnight)

    # Build the criteria once so the count and the data query always agree.
    criteria = []
    if start is not None:
        criteria.append(Task.updated >= start)
    if end is not None:
        criteria.append(Task.updated <= end)
    if biz_type:
        criteria.append(Task.business_type == biz_type)

    # Query.filter() returns a new query; its result must be kept, otherwise
    # the business-type restriction is silently dropped from the count.
    count = db.session.query(
        func.count(Task.id)).filter(*criteria).first()[0]

    query = db.session.query(
        Task.task_no,
        # Tasks whose house_code is NULL fall back to project_code.
        func.coalesce(
            Task.house_code, Task.project_code).label('house_code'),
        Task.mobile, Task.content, Task.updated, Task.created
    ).filter(*criteria).order_by(Task.id)
    # A stable ordering is required for offset/limit paging; without it the
    # database may return overlapping or incomplete pages.

    ret = []
    for offset in range(0, count, BATCH):
        for row in query.offset(offset).limit(BATCH).all():
            task_no, house_code, mobile, content, updated, created = row
            ret.append(dict(task_no=task_no, house_code=house_code,
                            mobile=mobile, content=content,
                            updated=updated or created))
    return ret
def group_tasks(tasks):
    """Group tasks that belong to the same house code.

    Args:
        tasks: iterable of task dicts; each must have a ``house_code`` key.

    Returns:
        collections.defaultdict: maps ``str(house_code)`` to the list of
        tasks carrying that house code, in input order.  An empty input
        yields an empty mapping.

    Raises:
        KeyError: if a task has no ``house_code`` key.
    """
    groups = defaultdict(list)
    for task in tasks:
        # Keys are normalised through str(), so a missing (None) house code
        # ends up under the key "None".
        groups[str(task["house_code"])].append(task)
    # No assertion that some house code repeats: an empty task list, or one
    # where every house code is unique, is valid input and simply produces
    # no multi-task groups.
    return groups
def get_group_id(tasks):
    """Build the identifier of a group of tasks sharing one house code.

    The identifier is the house code followed by the month and day of the
    newest ``updated`` value and then the month and day of the oldest one,
    each zero-padded to two digits (``<house_code>MMDDMMDD``).

    Args:
        tasks: non-empty collection of task dicts with ``house_code`` and
            ``updated`` keys; ``updated`` values must be comparable and
            expose ``month`` and ``day`` attributes.

    Returns:
        str: the group identifier.

    Raises:
        ValueError: if ``tasks`` is empty.
        AssertionError: if the tasks do not all share one house code.
    """
    tasks = list(tasks)
    if not tasks:
        raise ValueError("get_group_id() needs at least one task")

    house_codes = set(task.get('house_code') for task in tasks)
    if len(house_codes) != 1:
        # Raised explicitly (instead of a bare ``assert``) so the check
        # still runs when Python is started with -O.
        raise AssertionError(
            "tasks of one group must share a single house code")
    house_code = next(iter(house_codes))

    updated_dates = [task.get('updated') for task in tasks]
    newest, oldest = max(updated_dates), min(updated_dates)
    return "%s%02d%02d%02d%02d" % (
        house_code, newest.month, newest.day, oldest.month, oldest.day)
def check_duplicated_tasks(groups):
    """Score every house-code group for possibly duplicated tasks.

    All pairs of tasks inside a group are compared with
    ``check_similary_cos`` on their punctuation-stripped ``content``.  The
    highest pairwise similarity of the group is used as the indicator that
    the group may contain duplicated tasks.

    Args:
        groups: mapping of house code to the list of task dicts of that
            house; each task needs ``task_no`` and ``content`` keys.

    Returns:
        list of list of dict: one list per group that has at least one pair
        with similarity >= ``SIMILAR_RATE``.  Each list holds the tasks
        taking part in such a pair (deduplicated by ``task_no``).  The task
        dicts are the input dicts, updated in place with a ``similary`` key
        holding the group's maximum similarity.
    """
    ret = []
    # .values() instead of .iteritems(): the key is unused and this form
    # works on both Python 2 and Python 3.
    for group in groups.values():
        group = list(group)
        if len(group) < 2:
            # A single task has nothing to be compared with.
            continue

        # Strip punctuation once per task rather than once per pair.
        cleaned = [remove_punctuation(task['content']) for task in group]

        group_similary = 0.0
        picked = []
        picked_keys = set()
        for i, j in combinations(range(len(group)), 2):
            similary = check_similary_cos(cleaned[i], cleaned[j])
            if similary > group_similary:
                group_similary = similary
            # SIMILAR_RATE is the minimum similarity for a pair to be
            # reported; the default of 0.0 reports every pair.
            if similary >= SIMILAR_RATE:
                for task in (group[i], group[j]):
                    if task['task_no'] not in picked_keys:
                        picked_keys.add(task['task_no'])
                        picked.append(task)

        if picked:
            # Stamp the final group maximum only after all pairs were seen,
            # so every task of the group carries the same indicator value
            # rather than whatever the running maximum was when it was
            # first encountered.
            for task in picked:
                task.update(similary=group_similary)
            ret.append(picked)
    return ret
def save_groups_to_db(groups):
    """Persist scored task groups as ``TaskSimilary`` rows.

    For every task of every group one row is created with the group's id
    (from ``get_group_id``), the task's ``similary`` score, ``task_no``,
    ``house_code`` and ``updated`` timestamp.  All rows are staged with
    ``_commit=False`` and committed together at the end.

    Args:
        groups: iterable of task lists, as returned by
            ``check_duplicated_tasks``.

    Raises:
        SystemExit: (exit status 1) if any task has no ``updated`` value.
            This is checked before anything is staged, so no partial data
            is written.
    """
    groups = [list(tasks) for tasks in groups]

    # Validate up front: get_group_id() needs every ``updated`` value, so
    # checking after calling it (or after staging rows) would be too late.
    for tasks in groups:
        for task in tasks:
            if task.get("updated") is None:
                # A string argument is printed to stderr and produces exit
                # status 1, matching the previous ``exit(1)``.
                raise SystemExit(
                    "task %s has no 'updated' timestamp; nothing was saved"
                    % task.get("task_no"))

    for tasks in groups:
        group_id = get_group_id(tasks)
        for task in tasks:
            data = dict(group_id=group_id, similary=task.get("similary"),
                        task_no=task.get("task_no"),
                        house_code=task.get("house_code"),
                        task_updated=task.get("updated"))
            TaskSimilary.create(_commit=False, **data)
    db.session.commit()
def export_to_csv(data, filename=None):
    """Write rows to a CSV file.

    Args:
        data: list of rows; each row is an iterable of values.  Every value
            is converted with ``str()`` and has its newline characters
            removed before it is written.
        filename: target path.  Defaults to ``<unix timestamp>.csv`` in the
            current working directory.

    Raises:
        TypeError: if ``data`` is not a list.
    """
    # Imported locally so this function does not depend on a new
    # module-level import.
    import csv

    if not isinstance(data, list):
        raise TypeError("data should be a list")
    if filename is None:
        filename = str(int(time())) + ".csv"

    count = 0
    with open(filename, "w") as f:
        # csv.writer quotes fields containing commas or quotes, which a
        # plain ",".join() would write unescaped and thereby shift columns.
        writer = csv.writer(f, lineterminator="\n")
        for row in data:
            writer.writerow([str(x).replace("\n", "") for x in row])
            count += 1
    logging.debug("%s lines had been saved to %s", count, filename)
if __name__ == "__main__":
    # Script entry point: load -> group by house -> score -> persist.
    # NOTE(review): presumably the test request context is what makes the
    # application's database session usable here; confirm against the app.
    with app.test_request_context():
        loaded_tasks = load_tasks()
        tasks_by_house = group_tasks(loaded_tasks)
        scored_groups = check_duplicated_tasks(tasks_by_house)
        save_groups_to_db(scored_groups)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment