Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
计算文本 tf-idf
import math
from datetime import datetime, timedelta
from django.conf import settings
from django.core.management.base import BaseCommand
from apps.articles.models import Article
# Destination path of the generated IDF file, taken from the IDF_PATH Django
# setting; None when the setting is not defined.
idf_path = getattr(settings, "IDF_PATH", None)
class Command(BaseCommand):
    """Generate an IDF (inverse document frequency) file from recent articles.

    Scans every ``Article`` whose ``published_at`` is within the last
    ``--delta`` days, counts in how many articles each word appears, and
    writes one ``"<word> <idf>"`` line per word to ``idf_path``, where
    ``idf = ln(article_count / document_frequency)``.
    """

    help = "gen article tfidf file"

    def _is_number(self, word):
        """Return True when *word* is a numeric literal ("3", "1.5", "2e3").

        Numeric tokens are excluded from the IDF vocabulary by ``handle``.
        """
        try:
            float(word)
        except ValueError:
            return False
        # float() also parses "nan", "inf" and "infinity", which are ordinary
        # words rather than numbers; a real numeric literal has a digit.
        return any(ch.isdigit() for ch in word)

    def add_arguments(self, parser):
        """Register ``-d/--delta``: how many days back to look (default 30)."""
        parser.add_argument("-d", "--delta", type=int, help="cal", default=30)

    def handle(self, *args, **options):
        """Compute per-word IDF over recent articles and write it to ``idf_path``.

        Raises:
            CommandError: if the ``IDF_PATH`` setting is missing or empty.
        """
        # Imported here so the block is self-contained with respect to the
        # module-level import list.
        from django.core.management.base import CommandError

        # Fail before scanning the corpus instead of crashing in open() later.
        if not idf_path:
            raise CommandError("IDF_PATH is not configured in settings")

        # NOTE(review): datetime.now() is naive local time; confirm this
        # matches how published_at is stored.
        since = datetime.now() - timedelta(days=options["delta"])

        doc_count = 0
        doc_freq = {}
        # Count documents while iterating so the total always agrees with the
        # articles that actually contributed to doc_freq.
        for row in Article.objects(published_at__gte=since):
            doc_count += 1
            # presumably gen_idf() yields the article's segmented words;
            # verify against the Article model.
            seen = set()
            for raw in row.gen_idf():
                word = raw.strip()
                if not word or word in seen or self._is_number(word):
                    continue
                # Document frequency counts each word at most once per
                # article; otherwise df could exceed doc_count and the
                # logarithm below would go negative.
                seen.add(word)
                doc_freq[word] = doc_freq.get(word, 0) + 1

        # The context manager closes the file even if formatting or a write
        # fails part-way through.
        with open(idf_path, mode="w", encoding="utf-8") as idf_file:
            idf_file.writelines(
                "%s %.9f\n" % (word, math.log(doc_count / df))
                for word, df in doc_freq.items()
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.