Skip to content

Instantly share code, notes, and snippets.

@gt11799
Created January 15, 2018 07:53
Show Gist options
  • Save gt11799/db3257efa9ee458488002f262c6e2fe2 to your computer and use it in GitHub Desktop.
Save gt11799/db3257efa9ee458488002f262c6e2fe2 to your computer and use it in GitHub Desktop.
用于任务相似度的分析
#!coding:utf8
import logging
from sqlalchemy import func
from collections import defaultdict
from itertools import combinations
from time import time
from datetime import datetime
from text_detector import check_similary_cos, remove_punctuation
from falcon.corelibs.store import db
from falcon.models.task.task import Task
from falcon.models.task.task_similary import TaskSimilary, TASK_SIMILARY_BIZ_TYPE
from falcon.wsgi import application as app
# Minimum pairwise similarity a pair of tasks must reach to be collected by
# check_duplicated_tasks; at 0.0 every compared pair qualifies.
SIMILAR_RATE = 0.0
# Number of rows fetched per query while load_tasks pages through Task rows.
BATCH = 1000
def load_tasks_from_csv(path="tasks.csv"):
    """Load task records from a comma-separated text file.

    Every usable line must contain exactly five comma-separated fields, in
    the order ``task_no, house_code, mobile, content, updated``.  Lines with
    any other number of fields (blank lines, content that itself contains a
    comma, ...) are skipped and reported at debug level.

    Args:
        path: File to read.  Defaults to ``"tasks.csv"``, the name that used
            to be hard-coded.

    Returns:
        A list of dicts with the keys ``task_no``, ``house_code``,
        ``mobile``, ``content`` and ``updated``; all values are strings.
    """
    ret = list()
    with open(path, "r") as f:
        for line_no, line in enumerate(f, 1):
            # Strip the line terminator so it does not end up in `updated`.
            fields = line.rstrip("\r\n").split(",")
            try:
                task_no, house_code, mobile, content, updated = fields
            except ValueError:
                # Only a wrong field count is tolerated; anything else
                # (I/O errors, KeyboardInterrupt, ...) must propagate.
                logging.debug(
                    "skipping line %s of %s: expected 5 fields, got %s",
                    line_no, path, len(fields))
                continue
            ret.append(dict(task_no=task_no, house_code=house_code,
                            mobile=mobile, content=content, updated=updated))
    return ret
def load_tasks(start=None, end=None, biz_type=TASK_SIMILARY_BIZ_TYPE):
    """Load tasks updated inside a time window, reading them in pages.

    Args:
        start: Inclusive lower bound on ``Task.updated``.  When omitted it
            defaults to 00:00:00 on the 21st of the previous month.
        end: Inclusive upper bound on ``Task.updated``.  When omitted it
            defaults to 00:00:00 on the 20th of the current month.
        biz_type: When truthy, only tasks whose ``business_type`` equals
            this value are loaded.

    Returns:
        A list of dicts with the keys ``task_no``, ``house_code``,
        ``mobile``, ``content`` and ``updated``.  ``house_code`` falls back
        to the task's ``project_code`` when it is NULL, and ``updated``
        falls back to ``created`` when it is empty.
    """
    # Each bound is defaulted on its own, so passing only one of them no
    # longer ends up comparing the column against None.
    if start is None or end is None:
        now = datetime.now()
        if start is None:
            if now.month != 1:
                start = now.replace(
                    month=now.month - 1, day=21,
                    hour=0, minute=0, second=0, microsecond=0)
            else:
                # January: the previous month is December of last year.
                start = now.replace(
                    year=now.year - 1, month=12, day=21,
                    hour=0, minute=0, second=0, microsecond=0)
        if end is None:
            end = now.replace(
                day=20, hour=0, minute=0, second=0, microsecond=0)

    # One shared filter list keeps the count and the data query in sync; the
    # count query previously discarded the result of its biz_type filter.
    filters = [Task.updated >= start, Task.updated <= end]
    if biz_type:
        filters.append(Task.business_type == biz_type)

    count = db.session.query(func.count(Task.id)).filter(*filters).first()[0]

    query = db.session.query(
        # Tasks whose house_code is NULL are filled with project_code.
        Task.task_no,
        func.coalesce(
            Task.house_code, Task.project_code).label('house_code'),
        Task.mobile, Task.content, Task.updated, Task.created
    ).filter(*filters).order_by(Task.id)
    # The explicit ordering gives OFFSET/LIMIT paging a stable row order, so
    # pages cannot overlap or skip rows.

    ret = list()
    for offset in range(0, count, BATCH):
        for line in query.offset(offset).limit(BATCH).all():
            task_no, house_code, mobile, content, updated, created = line
            ret.append(dict(task_no=task_no, house_code=house_code,
                            mobile=mobile, content=content,
                            updated=updated or created))
    return ret
def group_tasks(tasks):
    """Put the tasks that share a house code into the same group.

    Args:
        tasks: Iterable of task dicts, each with a ``"house_code"`` key.

    Returns:
        A ``defaultdict(list)`` mapping ``str(house_code)`` to the list of
        tasks carrying that code, in input order.
    """
    groups = defaultdict(list)
    total = 0
    for t in tasks:
        groups[str(t["house_code"])].append(t)
        total += 1
    # This used to be an `assert`, which aborted the run on empty input or
    # when every task had its own house code (and vanishes under -O).  Such
    # input simply has no pair to compare, so report it and carry on.
    if total == len(groups):
        logging.warning(
            "no house code is shared by two or more tasks (%s tasks loaded)",
            total)
    return groups
def get_group_id(tasks):
    """Build the identifier of a group of tasks that share one house code.

    The identifier is the house code followed by the zero-padded month and
    day of the latest ``updated`` value, then the month and day of the
    earliest one.  All tasks must carry the same ``house_code``.
    """
    # Read both fields in a single pass over the tasks.
    fields = [(item.get('house_code'), item.get('updated')) for item in tasks]
    stamps = [stamp for _, stamp in fields]
    codes = [code for code, _ in fields]
    newest, oldest = max(stamps), min(stamps)
    assert len(set(codes)) == 1
    parts = (codes[-1], newest.month, newest.day, oldest.month, oldest.day)
    return "%s%02d%02d%02d%02d" % parts
def check_duplicated_tasks(groups):
    """Find possibly duplicated tasks inside each house-code group.

    Every pair of tasks in a group is compared; the highest pairwise
    similarity of the group is used as the indicator of possible duplicates
    and is stored on each collected task under the ``"similary"`` key.

    Args:
        groups: Mapping of house code to the list of task dicts for that
            house.  Each task needs the keys ``"content"`` and ``"task_no"``.

    Returns:
        A list with one entry per group that produced matches; each entry
        is the list of task dicts (the same objects as in ``groups``, now
        carrying ``"similary"``) that took part in at least one pair whose
        similarity reached ``SIMILAR_RATE``.
    """
    ret = list()
    # .values() works on Python 2 and 3 alike; the house code is not needed.
    for group in groups.values():
        members = list(group)
        if len(members) < 2:
            # Nothing to pair up, so nothing can be reported.
            continue
        # Clean each content once instead of once per pair.
        # NOTE(review): assumes remove_punctuation has no side effects.
        cleaned = [remove_punctuation(task['content']) for task in members]
        flagged = list()
        flagged_keys = set()
        group_similary = 0.0
        for i, j in combinations(range(len(members)), 2):
            similary = check_similary_cos(cleaned[i], cleaned[j])
            if similary > group_similary:
                group_similary = similary
            # SIMILAR_RATE is the minimum similarity for a pair to be
            # included; the default 0.0 includes every pair.
            if similary >= SIMILAR_RATE:
                for task in (members[i], members[j]):
                    if task['task_no'] not in flagged_keys:
                        flagged.append(task)
                        flagged_keys.add(task['task_no'])
        if flagged:
            # Stamp the final maximum only after all pairs were seen; doing
            # it on first insertion recorded a partial running maximum.
            for task in flagged:
                task.update(similary=group_similary)
            ret.append(flagged)
    return ret
def save_groups_to_db(groups):
    """Store every task of every group as a TaskSimilary row.

    All rows are created without committing and a single commit is issued
    at the end.

    Args:
        groups: Iterable of task lists, as returned by
            ``check_duplicated_tasks``.

    Raises:
        SystemExit: With status 1 when a task has no ``"updated"`` value.
            This keeps the previous exit status, but the problem is now
            logged and detected before the value is used.
    """
    for tasks in groups:
        # Validate first: get_group_id needs every `updated` value.
        for task in tasks:
            if task.get("updated") is None:
                logging.error(
                    "task %s has no updated time, nothing was saved",
                    task.get("task_no"))
                # Same outcome as the former bare exit(1), without relying
                # on the helper injected by the `site` module.
                raise SystemExit(1)
        group_id = get_group_id(tasks)
        for task in tasks:
            data = dict(group_id=group_id, similary=task.get("similary"),
                        task_no=task.get("task_no"),
                        house_code=task.get("house_code"),
                        task_updated=task.get("updated"))
            TaskSimilary.create(_commit=False, **data)
    db.session.commit()
def export_to_csv(data, filename=None):
    """Write rows to a CSV file, one row per line.

    Each value is converted with ``str`` and stripped of ``"\\n"`` so a row
    always occupies a single line.  Values containing a comma or a quote
    are quoted by ``csv.writer`` instead of silently shifting the columns.

    Args:
        data: List of rows; each row is an iterable of values.
        filename: Target path.  Defaults to ``"<unix timestamp>.csv"``.

    Raises:
        TypeError: If ``data`` is not a list.
    """
    import csv  # local import so the block does not rely on a file-level one

    if not isinstance(data, list):
        raise TypeError("data should be a list")
    if filename is None:
        filename = str(int(time())) + ".csv"
    count = 0
    with open(filename, "w") as f:
        # Keep "\n" as row terminator, as the hand-written output did.
        writer = csv.writer(f, lineterminator="\n")
        for line in data:
            writer.writerow([str(x).replace("\n", "") for x in line])
            count += 1
    logging.debug("%s lines had been saved to %s", count, filename)
if __name__ == "__main__":
    # Script entry point: inside a test request context of the app, load the
    # tasks, group them by house code, compare them and store the result.
    with app.test_request_context():
        loaded_tasks = load_tasks()
        tasks_by_house = group_tasks(loaded_tasks)
        similar_groups = check_duplicated_tasks(tasks_by_house)
        save_groups_to_db(similar_groups)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment