Skip to content

Instantly share code, notes, and snippets.

@jobelenus
Last active June 12, 2018 21:57
Show Gist options
  • Save jobelenus/e52b3c8cd86d07eca88c90fc50ec3332 to your computer and use it in GitHub Desktop.
Save jobelenus/e52b3c8cd86d07eca88c90fc50ec3332 to your computer and use it in GitHub Desktop.
How to get a set of KNOWN_GOOD
class SourceDimension(models.Model):
    """A lead-source row plus the canonical source name it fuzzy-matches to.

    ``fuzz_grouped_source_name`` and ``fuzz_ratio`` are derived columns;
    ``update_fuzz`` recomputes them from ``name``.
    """

    source_id = models.IntegerField(db_index=True)
    name = models.CharField(max_length=256, db_index=True)
    category = models.CharField(max_length=128, default='')
    # Canonical name this row was grouped under; '' when nothing matched.
    fuzz_grouped_source_name = models.CharField(max_length=256, default='')
    # Score of the winning match; 0 when nothing matched.
    fuzz_ratio = models.PositiveIntegerField(default=0)

    # Order matters: update_fuzz keeps the FIRST entry on a tied score.
    KNOWN_GOOD = [
        # These are first on purpose
        'Edmunds CarCode', 'Edmunds Price Promise', 'TrueCar/USAA',
        # generally we want the MOST specific first, least specific last
        'Dealer.com', 'Cars.com', 'CarGurus', 'TrueCar', 'Edmunds',
        'Carloan.com', 'Kelley Blue Book', 'Third Party Core',
        'CARFAX', 'DealerFire', 'Costco', 'Tdds',
        'TradePending', 'Jazel', 'PureCars'
    ]
    DEFAULT_THRESHOLD = 85
    # Names grouped under themselves with a score of 100, bypassing the
    # fuzzy comparison against KNOWN_GOOD entirely.
    EXACT_NAMES = ('Strolid Phone Up', 'Strolid Chat')

    @staticmethod
    def find_source_name_commonalities(delimiter=',', max_length=5):
        """Count the tokens that occur across all non-empty source names.

        Usage::

            from strolid_reporting.models import SourceDimension
            counter = SourceDimension.find_source_name_commonalities()

        Each name is split on ``delimiter``; every token is stripped of
        surrounding whitespace and counted only when the stripped token is
        longer than ``max_length`` characters.  Despite its name,
        ``max_length`` is therefore an exclusive *lower* bound.

        Returns:
            A ``Counter`` mapping token -> number of occurrences.  Keys are
            inserted longest first (assuming longer = more specific, see the
            comment in ``KNOWN_GOOD``), so plain iteration yields them in
            that order on Python 3.7+.  ``most_common()`` still sorts by
            count.
        """
        counts = Counter()
        names = SourceDimension.objects.exclude(name='').values_list(
            'name', flat=True)
        for name in names:
            # Strip BEFORE measuring so padding never counts toward length.
            stripped = (token.strip() for token in name.split(delimiter))
            counts.update(
                token for token in stripped if len(token) > max_length)

        # Re-insert longest first; sorted() is stable, so tokens of equal
        # length keep their first-seen order.
        ordered = Counter()
        for token in sorted(counts, key=len, reverse=True):
            ordered[token] = counts[token]
        return ordered

    @staticmethod
    def update_fuzz(threshold=None):
        """Recompute ``fuzz_grouped_source_name`` / ``fuzz_ratio`` for all rows.

        Usage::

            from strolid_reporting.models import SourceDimension
            SourceDimension.update_fuzz()
            d = SourceDimension.objects.exclude(fuzz_ratio=0).order_by('fuzz_ratio').first()
            print(d.name, '|', d.fuzz_grouped_source_name, '|', d.fuzz_ratio)

            >>> SourceDimension.objects.filter(fuzz_ratio=0).count()
            2851
            >>> SourceDimension.objects.exclude(fuzz_ratio=0).count()
            1943

        Every row is first reset to "no match".  Then each row with a
        non-empty name is either matched exactly (``EXACT_NAMES``) or scored
        against each ``KNOWN_GOOD`` entry with ``fuzz.partial_ratio``; the
        highest score strictly above ``threshold`` wins, the earliest entry
        winning ties.

        Args:
            threshold: Minimum score (exclusive) for a fuzzy match.  ``None``
                selects ``DEFAULT_THRESHOLD``; an explicit ``0`` is honoured.
        """
        # `is None`, not truthiness, so a caller-supplied 0 is not discarded.
        if threshold is None:
            threshold = SourceDimension.DEFAULT_THRESHOLD

        # Wipe previous results in one query; only rows that match are
        # written again below.
        SourceDimension.objects.update(fuzz_grouped_source_name='', fuzz_ratio=0)

        for dim in SourceDimension.objects.exclude(name=''):
            if dim.name in SourceDimension.EXACT_NAMES:
                best_name, best_ratio = dim.name, 100
            else:
                best_name, best_ratio = '', 0
                for grouped_name in SourceDimension.KNOWN_GOOD:
                    ratio = fuzz.partial_ratio(grouped_name, dim.name)
                    # Strict '>' keeps the earlier (more specific) entry on
                    # a tie.
                    if ratio > threshold and ratio > best_ratio:
                        best_name, best_ratio = grouped_name, ratio
                        # A 100 is never replaced, so stop scoring.
                        if best_ratio == 100:
                            break

            if not best_name:
                # Unmatched: the bulk reset already stored '' / 0.
                continue

            dim.fuzz_grouped_source_name = best_name
            dim.fuzz_ratio = best_ratio
            dim.save(update_fields=['fuzz_grouped_source_name', 'fuzz_ratio'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment