Last active
December 2, 2018 04:12
-
-
Save mado-m/068b320a68429dc9c88e9a195155127d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import re | |
import time | |
from urllib.parse import urlparse, parse_qs | |
# How many leading items of a list print_summary shows as a preview.
LOG_HEAD_SIZE = 50
# Regex matching a run of term delimiters: space, literal '|', or '+'
# ('+' is the URL-encoded space left inside q=/l= parameter values).
# Raw string avoids the invalid '\+' escape warning of the original
# '[ |\+]+'; inside a character class '+' needs no escaping, so the
# matched character set { ' ', '|', '+' } is unchanged.
SPLIT_DELIMITER = r'[ |+]+'
def main():
    """Aggregate a search access log into unique (query, location) pairs.

    Reads 'search.csv', whose first column is a request path such as
    ``/search?q=POP制作+未経験&l=東京都+江東区`` and whose second column is a
    view count; extracts the q= and l= parameters, tokenizes and sorts their
    terms so term order does not matter, sums the views of duplicate pairs,
    and writes the result to 'ql.csv' sorted by view count descending.
    """
    print("Start.")
    start_time = time.time()

    # 1) Read (path, view) pairs, skipping the header row.
    path_view_tuples = []
    with open('search.csv', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip header
        for row in reader:
            path_view_tuples.append((row[0], row[1]))
    print_summary(path_view_tuples, 'path_view_tuples')

    # 2) Pull the q= (query) and l= (location) parameters out of each path;
    #    missing parameters become the empty string.
    query_location_view_tuples = []
    for path, view in path_view_tuples:
        all_query = parse_qs(urlparse(path).query)
        query = all_query['q'][0] if 'q' in all_query else ''
        location = all_query['l'][0] if 'l' in all_query else ''
        query_location_view_tuples.append((query, location, view))
    print_summary(query_location_view_tuples, 'query_location_view_tuples')

    # 3) Split each value into terms and sort them, so 'A B' and 'B A'
    #    compare equal. BUG FIX: the original called
    #    str.strip(SPLIT_DELIMITER), but str.strip treats its argument as a
    #    set of characters, so it stripped '[', ']', '\\' (and ' ', '|',
    #    '+') from the ends instead of applying the regex. Dropping empty
    #    tokens after the split achieves the intended "ignore leading and
    #    trailing delimiters" behavior correctly.
    split_query_location_view_tuples = []
    for query, location, view in query_location_view_tuples:
        split_query = sorted(t for t in re.split(SPLIT_DELIMITER, query) if t)
        split_location = sorted(t for t in re.split(SPLIT_DELIMITER, location) if t)
        split_query_location_view_tuples.append((split_query, split_location, view))
    print_summary(split_query_location_view_tuples, 'split_query_location_view_tuples')

    # 4) Merge duplicate (queries, locations) pairs, summing their views.
    unique_query_location_view_tuples = []
    for queries, locations, view in split_query_location_view_tuples:
        index = get_index(unique_query_location_view_tuples, queries, locations)
        # BUG FIX: the original tested `index > 0`, which treated a match at
        # position 0 as "not found", so the very first unique pair was never
        # merged and appeared duplicated in the output.
        if index >= 0:
            sum_view = int(unique_query_location_view_tuples[index][2]) + int(view)
            unique_query_location_view_tuples.pop(index)
            unique_query_location_view_tuples.append((queries, locations, sum_view))
        else:
            unique_query_location_view_tuples.append((queries, locations, int(view)))
    unique_query_location_view_tuples.sort(key=lambda t: t[2], reverse=True)
    print_summary(unique_query_location_view_tuples, 'unique_query_location_view_tuples')

    # 5) Write the aggregated result. newline='' lets the csv module manage
    #    line endings itself, as the csv docs require.
    with open('ql.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(("q", "l", "count"))
        for queries, locations, view in unique_query_location_view_tuples:
            writer.writerow((" ".join(queries), " ".join(locations), view))
    print('End. time=[%d]s' % int(time.time() - start_time))
def get_index(tuples, unique_queries, unique_locations):
    """Return the position of the first entry in *tuples* whose query list
    and location list equal the given ones, or -1 when no entry matches.

    Each entry in *tuples* is a (queries, locations, view) triple.
    """
    for position, entry in enumerate(tuples):
        if entry[0] == unique_queries and entry[1] == unique_locations:
            return position
    return -1
def print_summary(targets, targets_name):
    """Log the size of *targets* and a preview of its first LOG_HEAD_SIZE items."""
    size_line = '%s.size=[%d]' % (targets_name, len(targets))
    head_line = '%s.head=[%s]' % (targets_name, targets[:LOG_HEAD_SIZE])
    print(size_line)
    print(head_line)
# Script entry point: run the aggregation when executed directly.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment