Skip to content

Instantly share code, notes, and snippets.

@mado-m
Last active December 2, 2018 04:12
Show Gist options
  • Save mado-m/068b320a68429dc9c88e9a195155127d to your computer and use it in GitHub Desktop.
Save mado-m/068b320a68429dc9c88e9a195155127d to your computer and use it in GitHub Desktop.
import csv
import re
import time
from urllib.parse import urlparse, parse_qs
# Number of leading items echoed by print_summary() when logging a list.
LOG_HEAD_SIZE = 50
# Regex matching a run of term separators: space, '|' or '+'.
# Written as a raw string so that `\+` reaches the regex engine unchanged
# instead of being parsed as an (invalid) string escape sequence.
# NOTE(review): inside a character class `|` is a literal pipe, not
# alternation -- confirm that splitting on '|' is intended.
SPLIT_DELIMITER = r'[ |\+]+'
def _split_terms(text):
    """Return the sorted, non-empty terms of *text* split on SPLIT_DELIMITER."""
    # Filtering empty strings replaces the former ``text.strip(SPLIT_DELIMITER)``,
    # which passed the regex source to str.strip() as a *character set* and
    # therefore also stripped '[', ']' and '\\' from the ends of the text.
    return sorted(term for term in re.split(SPLIT_DELIMITER, text) if term)


def main():
    """Aggregate view counts per (query, location) and write them to ql.csv.

    Reads ``search.csv`` (first row is a header; column 0 is the request path,
    column 1 the view count), extracts the ``q`` and ``l`` query parameters,
    normalises each into a sorted list of terms so that term order does not
    matter, sums the views of identical (q, l) pairs and writes the result to
    ``ql.csv`` sorted by descending count.

    Raises:
        ValueError: if a view count cannot be converted with ``int()``.
    """
    # Example input (header, then one row per path):
    # xxx,xxx,xxx,xxx,xxx,xxx,xxx,xxx
    # /search?l=東京都,10,10,10.0,10,10.0,10.0,10
    # /search?q=土日,10,10,10.0,10,10.0,10.0,10
    # /search?et=PART&q=パート 3日 4日 2日&l=福島県+伊達市 福島市&offset=620,10,10,10.0,10,10.0,10.0,10
    # /search?q=POP制作+未経験&l=東京都+江東区,10,10,10.0,10,10.0,10.0,10
    # "/search?et=PART,VLTR&l=愛媛県大洲市&d=300.0&offset=20",10,10,10.0,10,10.0,10.0,10
    print("Start.")
    start_time = time.time()

    # newline='' is what the csv module asks for so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    path_view_tuples = []
    with open('search.csv', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header; tolerate a completely empty file
        for row in reader:
            if len(row) < 2:
                continue  # blank or truncated line: nothing to count
            path_view_tuples.append((row[0], row[1]))
    print_summary(path_view_tuples, 'path_view_tuples')

    # Pull the 'q' (query) and 'l' (location) parameters out of each path;
    # a missing parameter becomes the empty string.
    query_location_view_tuples = []
    for path, view in path_view_tuples:
        all_query = parse_qs(urlparse(path).query)
        query = all_query.get('q', [''])[0]
        location = all_query.get('l', [''])[0]
        query_location_view_tuples.append((query, location, view))
    print_summary(query_location_view_tuples, 'query_location_view_tuples')

    # Sorting the terms makes "a b" and "b a" compare equal below.
    split_query_location_view_tuples = [
        (_split_terms(query), _split_terms(location), view)
        for query, location, view in query_location_view_tuples
    ]
    print_summary(split_query_location_view_tuples,
                  'split_query_location_view_tuples')

    # Sum the views per distinct (queries, locations) pair.  The previous
    # list scan tested ``index > 0`` and so never merged repeats of the very
    # first entry; a dict keyed on the term tuples has no such sentinel and
    # avoids the quadratic search.  pop + re-insert moves an updated key to
    # the end, reproducing the old "remove and append" ordering for ties.
    totals = {}
    for queries, locations, view in split_query_location_view_tuples:
        key = (tuple(queries), tuple(locations))
        totals[key] = totals.pop(key, 0) + int(view)
    unique_query_location_view_tuples = [
        (list(queries), list(locations), count)
        for (queries, locations), count in totals.items()
    ]
    unique_query_location_view_tuples.sort(key=lambda t: t[2], reverse=True)
    print_summary(unique_query_location_view_tuples,
                  'unique_query_location_view_tuples')

    with open('ql.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(("q", "l", "count"))
        for queries, locations, count in unique_query_location_view_tuples:
            writer.writerow((" ".join(queries), " ".join(locations), count))

    print('End. time=[%d]s' % int(time.time() - start_time))
def get_index(tuples, unique_queries, unique_locations):
    """Return the position of the first entry matching both term lists.

    Args:
        tuples: sequence of entries whose element 0 is a query-term list and
            element 1 a location-term list (further elements are ignored).
        unique_queries: query-term list to look for.
        unique_locations: location-term list to look for.

    Returns:
        The index of the first entry whose element 0 equals *unique_queries*
        and whose element 1 equals *unique_locations*, or -1 when there is
        none.  Index 0 is a valid hit, so callers must test ``>= 0``.
    """
    for i, entry in enumerate(tuples):
        if entry[0] == unique_queries and entry[1] == unique_locations:
            return i
    # Only reached once every entry has been inspected without a match.
    return -1
def print_summary(targets, targets_name, head_size=None):
    """Print the length of *targets* and its first few items.

    Args:
        targets: a sliceable sequence with a length (e.g. a list).
        targets_name: label used as the prefix of both output lines.
        head_size: how many leading items to show; ``None`` (the default)
            means the module-level ``LOG_HEAD_SIZE``.
    """
    # Resolve the default at call time so the module constant stays the
    # single source of truth for existing two-argument callers.
    limit = LOG_HEAD_SIZE if head_size is None else head_size
    print('%s.size=[%d]' % (targets_name, len(targets)))
    print('%s.head=[%s]' % (targets_name, targets[:limit]))
# Run the aggregation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment