Skip to content

Instantly share code, notes, and snippets.

@snakers4
Last active January 15, 2017 10:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save snakers4/43c31262d5511607709927cf243fddfd to your computer and use it in GitHub Desktop.
Save snakers4/43c31262d5511607709927cf243fddfd to your computer and use it in GitHub Desktop.
DoubleData illustration
SELECT DISTINCT
count (DISTINCT rl.afisha_rest_id) as distinct_rest_count,
count (DISTINCT r2u.afisha_user_id) as review_user_count,
count (DISTINCT r2u.*) as review_count,
count (DISTINCT a2n.social_network_id) as social_network_users,
count (DISTINCT vsoc.social_network_id) as social_graph_data_count
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
/* Тут все join-ы левые, т.к. очевидно что социальный граф есть у меньшинства */
LEFT JOIN afisha_user_id2social_netw a2n ON a2n."type" = 1 AND a2n.user_id = r2u.afisha_user_id
LEFT JOIN vk_social_attr vsoc ON 'id' || vsoc.social_network_id = a2n.social_network_id
LEFT JOIN vk_id2vk_group u2g ON 'id' || u2g.vk_id = a2n.social_network_id
SELECT
bot.vk_id,
bot.rest_review_count_per_critic as rest_review_count_per_critic,
bot.is_bot as is_bot,
vks.*
FROM
bot_anomaly_dataset bot
JOIN vk_social_attr vks ON bot.vk_id = 'id' || vks.social_network_id
fine casual expensive cheap Max of Кафе Max of Европейская кухня Max of Русская кухня Max of Фастфуд Max of Пицца Max of Итальянская кухня Max of Бары Max of Японская кухня Max of Паста Max of Суши Max of Сэндвичи Max of Кондитерские Max of Бургеры Max of Кофейни Max of Американская кухня Max of Вок Max of Кальяны Max of Грузинская кухня Max of Коктейльная карта Max of Узбекская кухня Max of Блины Max of Пироги Max of Стейки Max of Караоке Max of Хачапури Max of Спортбары Max of Пекарни Max of Китайская кухня Max of Пивные рестораны Max of Пельмени Max of Банкетные залы Max of Столовые Max of Вегетарианское меню Max of Паназиатская кухня Max of Стритфуд Max of Хинкали Max of Азербайджанская кухня Max of Кулинарии Max of Винотеки Max of Армянская кухня Max of Пабы Max of Турецкая кухня Max of Мексиканская кухня Max of Живая музыка Max of Татарская кухня Max of Средиземноморская кухня Max of Чебуреки Max of Французская кухня Max of Немецкая кухня Max of Арабская кухня Max of Украинская кухня Max of Чайные Max of Концерты Max of Ресторанная программа Visa Premium Max of Куда пойти с детьми Max of Английская кухня Max of Корейская кухня Max of Халяльные рестораны Max of Торты на заказ Max of Тайская кухня Max of Рыбные рестораны Max of Гастропабы Max of Где пить пиво Max of Где пить кофе Max of Башкирская кухня Max of Индийская кухня Max of Греческая кухня Max of Чешская кухня Max of Лучшие места с дешевой едой Max of Вьетнамская кухня Max of Где есть бургеры Max of Где поесть ночью Max of Лучшие итальянские рестораны Max of Испанская кухня Max of Осетинская кухня Max of Где лечить похмелье Max of Лучшие рестораны в Петербурге Max of Сербская кухня Max of Где есть горячие супы Max of Лучшие рестораны в Москве Max of Еврейская кухня Max of Где есть мясо Max of Авторская кухня Max of Куда пойти с бабушкой и дедушкой Max of Бельгийская кухня Max of Ливанская кухня Max of Где пить вино Max of Шорт-лист премии Wheretoeat Max of Лучший фастфуд Max of Где есть пиццу Max of Лучшие ирландские пабы Max of Где отметить праздник Max of Лучшие бары в Петербурге Max of Где смотреть футбол в Петербурге Max of Лучшее мороженое Max of Лучшие завтраки в Москве Max of Где смотреть футбол в Москве Max of Где пить сидр Max of Лучшие бары Москвы Max of Уйгурская кухня Max of Где смотреть футбол в Москве2 Max of Лучшие кондитерские Max of Лучшие новые рестораны Max of Лучшие кавказские рестораны Max of Лучшие места на Петроградской стороне Max of Где пить водку Max of Где есть пельмени Max of Где есть суши Max of Где есть устриц Max of Латиноамериканская кухня
SELECT DISTINCT
'http://www.afisha.ru/msk/restaurant/' || rl.afisha_rest_id as rest_url,
rl.rest_name as rest_name,
rl.price as rest_price,
rl.cuisine as cuisine,
rl.tags as tags,
rest_tags_review_count.review_count_per_restaurant as review_count_per_restaurant,
COUNT (DISTINCT rest_tags_review_count.tag) as tags_per_restaurant,
CASE
WHEN rl.afisha_rest_id IN (173934, 20549, 28017, 45152, 61072, 151051, 18785, 197696, 186396, 259664, 22334, 16586, 29762, 20481, 22222, 32875, 40192, 29585, 29688, 41431, 59453, 36750, 18677, 29048, 30600, 22793, 22281, 20482, 20482, 28114, 35958, 30256, 36752, 16700, 20439, 18593, 32591, 32591) THEN 1
ELSE 0
END as novikov,
r2cl.cluster_id as cluster_id,
r2r.rating as rest_rating
FROM
(
SELECT
rl.afisha_rest_id as afisha_rest_id,
regexp_split_to_table(rl.tags, ',') as tag,
count(r2u.afisha_user_id) as review_count_per_restaurant
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
GROUP BY
rl.afisha_rest_id,
regexp_split_to_table(rl.tags, ',')
) rest_tags_review_count
JOIN rest_list rl ON rl.afisha_rest_id = rest_tags_review_count.afisha_rest_id
JOIN rest2cluster r2cl ON r2cl.afisha_rest_id = rl.afisha_rest_id
JOIN rest_rating r2r ON r2r.afisha_rest_id = rl.afisha_rest_id
GROUP BY
'http://www.afisha.ru/msk/restaurant/' || rl.afisha_rest_id,
rl.price,
rl.cuisine,
rl.rest_name,
rest_tags_review_count.review_count_per_restaurant,
rl.tags,
r2cl.cluster_id,
r2r.rating,
CASE
WHEN rl.afisha_rest_id IN (173934, 20549, 28017, 45152, 61072, 151051, 18785, 197696, 186396, 259664, 22334, 16586, 29762, 20481, 22222, 32875, 40192, 29585, 29688, 41431, 59453, 36750, 18677, 29048, 30600, 22793, 22281, 20482, 20482, 28114, 35958, 30256, 36752, 16700, 20439, 18593, 32591, 32591) THEN 1
ELSE 0
END
ORDER BY
rest_tags_review_count.review_count_per_restaurant DESC
/*
vga.group_name as group_name,
vga."сategory" as group_category,
vga.subcategory as subcategory,
'https://vk.com/club' || vga.vk_group_id as url,
u2g.vk_group_id as vk_group_id,
COUNT (DISTINCT r2u.afisha_user_id) as count_afisha_user_id,
COUNT (DISTINCT rl.afisha_rest_id) as count_afisha_rest_id,
vga.subscribers_int
*/
/*
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
JOIN afisha_user_id2social_netw a2n ON a2n."type" = 1 AND a2n.user_id = r2u.afisha_user_id
JOIN vk_id2vk_group u2g ON 'id' || u2g.vk_id = a2n.social_network_id
JOIN vk_group_attr vga ON vga.vk_group_id = u2g.vk_group_id
*/
/*
u2g.vk_group_id,
vga.subscribers_int,
'https://vk.com/club' || vga.vk_group_id,
vga."сategory",
vga.subcategory,
vga.group_name
*/
SELECT DISTINCT
'http://www.afisha.ru/msk/restaurant/' || rl.afisha_rest_id as rest_url,
rl.rest_name as rest_name,
rl.price as rest_price,
rl.cuisine as cuisine,
rl.tags as tags,
rest_tags_review_count.review_count_per_restaurant as review_count_per_restaurant,
COUNT (DISTINCT rest_tags_review_count.tag) as tags_per_restaurant,
CASE
WHEN rl.afisha_rest_id IN (173934, 20549, 28017, 45152, 61072, 151051, 18785, 197696, 186396, 259664, 22334, 16586, 29762, 20481, 22222, 32875, 40192, 29585, 29688, 41431, 59453, 36750, 18677, 29048, 30600, 22793, 22281, 20482, 20482, 28114, 35958, 30256, 36752, 16700, 20439, 18593, 32591, 32591) THEN 1
ELSE 0
END as novikov
FROM
(
SELECT
rl.afisha_rest_id as afisha_rest_id,
regexp_split_to_table(rl.tags, ',') as tag,
count(r2u.afisha_user_id) as review_count_per_restaurant
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
GROUP BY
rl.afisha_rest_id,
regexp_split_to_table(rl.tags, ',')
) rest_tags_review_count
JOIN rest_list rl ON rl.afisha_rest_id = rest_tags_review_count.afisha_rest_id
GROUP BY
'http://www.afisha.ru/msk/restaurant/' || rl.afisha_rest_id,
rl.price,
rl.cuisine,
rl.rest_name,
rest_tags_review_count.review_count_per_restaurant,
rl.tags,
CASE
WHEN rl.afisha_rest_id IN (173934, 20549, 28017, 45152, 61072, 151051, 18785, 197696, 186396, 259664, 22334, 16586, 29762, 20481, 22222, 32875, 40192, 29585, 29688, 41431, 59453, 36750, 18677, 29048, 30600, 22793, 22281, 20482, 20482, 28114, 35958, 30256, 36752, 16700, 20439, 18593, 32591, 32591) THEN 1
ELSE 0
END
ORDER BY
rest_tags_review_count.review_count_per_restaurant DESC
SELECT
COUNT(rl.afisha_rest_id) as restaurant_count,
COUNT(r2u.afisha_user_id) as review_user_count,
rl.cuisine as cuisine,
--array_length(regexp_split_to_array(rl.tags, ','), 1) as tag_count,
CASE
WHEN rl.price = 'До 700 рублей' THEN 1
ELSE 0
END as cheap,
CASE
WHEN rl.price = '700–1500 рублей' THEN 1
ELSE 0
END as casual,
CASE
WHEN rl.price = '1500–2500 рублей' THEN 1
ELSE 0
END as fine,
CASE
WHEN rl.price = 'Больше 2500 рублей' THEN 1
ELSE 0
END as expensive
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
GROUP BY
rl.cuisine,
rl.price
--array_length(regexp_split_to_array(rl.tags, ','), 1)
ORDER BY
restaurant_count DESC
SELECT DISTINCT
a2n.afisha_user_id as afisha_user_id,
count(DISTINCT r2u.afisha_rest_id) as rest_review_count_per_critic,
'https://vk.com/' || a2n.social_network_id as vk_url
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
/* Тут все join-ы левые, т.к. очевидно что социальный граф есть у меньшинства */
LEFT JOIN afisha_user_id2social_netw a2n ON a2n."type" = 1 AND a2n.user_id = r2u.afisha_user_id
LEFT JOIN vk_social_attr vsoc ON 'id' || vsoc.social_network_id = a2n.social_network_id
GROUP BY
a2n.afisha_user_id,
'https://vk.com/' || a2n.social_network_id
ORDER BY
count(DISTINCT r2u.afisha_rest_id) DESC
SELECT
*
FROM
(
SELECT
rl.afisha_rest_id as afisha_rest_id,
regexp_split_to_table(rl.tags, ',') as tags,
rl.price as price
/* Сюда так и просится расстояние до метро...и индикатор центра города */
FROM
rest_list rl
) raw
WHERE
raw.tags <> '0' AND raw.price <> '0'
SELECT
rl.afisha_rest_id as afisha_rest_id,
/* тут все очень резко, просто и arbitrary - можно делать кривую мягче и настроить под клиента */
(log(rest_tags_review_count.review_count_per_restaurant + 1)^0.1/1.06) as reviews_score,
(count(DISTINCT a2n.social_network_id)::NUMERIC / (count(a2n.*)::NUMERIC +1) ) as social_score,
avg(vkf.group_fit) as group_fit_score
FROM
(
SELECT
rl.afisha_rest_id as afisha_rest_id,
count(r2u.afisha_user_id) as review_count_per_restaurant
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
GROUP BY
rl.afisha_rest_id,
regexp_split_to_table(rl.tags, ',')
) rest_tags_review_count
JOIN rest_list rl ON rl.afisha_rest_id = rest_tags_review_count.afisha_rest_id
LEFT JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
LEFT JOIN afisha_user_id2social_netw a2n ON a2n."type" = 1 AND a2n.user_id = r2u.afisha_user_id
LEFT JOIN vk_id2vk_group u2g ON 'id' || u2g.vk_id = a2n.social_network_id
LEFT JOIN vk_group_fit vkf ON vkf.vk_group_id = u2g.vk_group_id
GROUP BY
rl.afisha_rest_id,
(log(rest_tags_review_count.review_count_per_restaurant + 1)^0.1/1.06)
LIMIT 1000
SELECT
raw_data.afisha_rest_id as afisha_rest_id,
raw_data.reviews_score as reviews_score,
round (log(1/(raw_data.social_score+0.01))/3 :: NUMERIC, 2) as social_score,
raw_data.group_fit_score as group_fit_score
FROM
(
SELECT
rl.afisha_rest_id as afisha_rest_id,
/* тут все очень резко, просто и arbitrary - можно делать кривую мягче и настроить под клиента */
round( (log(rest_tags_review_count.review_count_per_restaurant + 1)^0.1/1.06) :: NUMERIC, 3) as reviews_score,
round( (count(DISTINCT a2n.social_network_id)::NUMERIC / (count(a2n.*)::NUMERIC +1) ), 3) as social_score,
round( avg(vkf.group_fit) :: NUMERIC, 3) as group_fit_score
FROM
(
SELECT
rl.afisha_rest_id as afisha_rest_id,
count(r2u.afisha_user_id) as review_count_per_restaurant
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
GROUP BY
rl.afisha_rest_id,
regexp_split_to_table(rl.tags, ',')
) rest_tags_review_count
JOIN rest_list rl ON rl.afisha_rest_id = rest_tags_review_count.afisha_rest_id
LEFT JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
LEFT JOIN afisha_user_id2social_netw a2n ON a2n."type" = 1 AND a2n.user_id = r2u.afisha_user_id
LEFT JOIN vk_id2vk_group u2g ON 'id' || u2g.vk_id = a2n.social_network_id
LEFT JOIN vk_group_fit vkf ON vkf.vk_group_id = u2g.vk_group_id
GROUP BY
rl.afisha_rest_id,
(log(rest_tags_review_count.review_count_per_restaurant + 1)^0.1/1.06)
) raw_data
SELECT DISTINCT
rest_tags_review_count.tag as tag,
count(rest_tags_review_count.afisha_rest_id) as rest_count_per_tag
FROM
(
SELECT
rl.afisha_rest_id as afisha_rest_id,
regexp_split_to_table(rl.tags, ',') as tag,
count(r2u.afisha_user_id) as review_count_per_restaurant
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
GROUP BY
rl.afisha_rest_id,
regexp_split_to_table(rl.tags, ',')
) rest_tags_review_count
JOIN rest_list rl ON rl.afisha_rest_id = rest_tags_review_count.afisha_rest_id
GROUP BY
rest_tags_review_count.tag
ORDER BY
count(rest_tags_review_count.afisha_rest_id) DESC
SELECT DISTINCT
'https://vk.com/club' || vga.vk_group_id as url,
u2g.vk_group_id as vk_group_id,
COUNT (DISTINCT r2u.afisha_user_id) as count_afisha_user_id,
COUNT (DISTINCT rl.afisha_rest_id) as count_afisha_rest_id,
vga.subscribers_int
FROM
rest_list rl
JOIN rest_id2afisha_user_id r2u ON r2u.afisha_rest_id = rl.afisha_rest_id
JOIN afisha_user_id2social_netw a2n ON a2n."type" = 1 AND a2n.user_id = r2u.afisha_user_id
JOIN vk_id2vk_group u2g ON 'id' || u2g.vk_id = a2n.social_network_id
JOIN vk_group_attr vga ON vga.vk_group_id = u2g.vk_group_id
GROUP BY
u2g.vk_group_id,
vga.subscribers_int,
'https://vk.com/club' || vga.vk_group_id
ORDER BY
count_afisha_user_id DESC
SELECT
count (vkg.*) as total_groups,
sum(CASE WHEN vkg.subscribers_int NOTNULL THEN 1 ELSE 0 END) as not_null_subscribers,
sum(CASE WHEN vkg.visitors_int NOTNULL THEN 1 ELSE 0 END) as not_null_visitors,
sum(CASE WHEN vkg.coverage_int NOTNULL THEN 1 ELSE 0 END) as not_null_coverage
FROM
vk_group_attr vkg
SELECT
corr(vkg.subscribers_int,vkg.visitors_int) subscribers_int_visitors_int,
corr(vkg.subscribers_int,vkg.coverage_int),
corr(vkg.visitors_int,vkg.coverage_int)
/*
vkg.subscribers_int
vkg.visitors_int
vkg.coverage_int
*/
FROM
vk_group_attr vkg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment