Skip to content

Instantly share code, notes, and snippets.

@mac2000
Created August 19, 2017 18:47
Show Gist options
  • Save mac2000/3f4f460d441aa41a0dfa3fa8e8fc443d to your computer and use it in GitHub Desktop.
Save mac2000/3f4f460d441aa41a0dfa3fa8e8fc443d to your computer and use it in GitHub Desktop.
bigquery recommendations based on user ratings
WITH d AS (
-- should return (cid - user id, vid - item id, rating)
SELECT
clientId as cid,
REGEXP_EXTRACT(page.pagePath, r'/company\d+/vacancy(\d+)') AS vid,
MAX(
CASE
WHEN page.pagePath LIKE '%apply=thanks%' THEN 5
WHEN page.pagePath LIKE '%mode=apply%' THEN 2
ELSE 1
END
) AS rating
FROM `majestic-cairn-171208`.bigdata.ga WHERE
_PARTITIONTIME BETWEEN TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 * 24 HOUR),DAY) AND TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 * 24 HOUR),DAY)
AND type = 'pageview'
AND page.pagePath LIKE '/company%/vacancy%'
GROUP BY cid, vid
), v AS (
-- optional, just remove join in last query, should return (vid - item id, title)
SELECT DISTINCT
REGEXP_EXTRACT(page.pagePath, r'/company\d+/vacancy(\d+)') AS vid,
REGEXP_REPLACE(REGEXP_EXTRACT(page.title, r'(.+) в .+ - .+ | Rabota.ua'), r'\d+ грн$', '') AS title
FROM `majestic-cairn-171208`.bigdata.ga WHERE
_PARTITIONTIME BETWEEN TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 * 24 HOUR),DAY) AND TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 * 24 HOUR),DAY)
AND type = 'pageview'
AND page.pagePath LIKE '/company%/vacancy%'
AND page.pagePath NOT LIKE '%mode=apply%'
AND page.pagePath NOT LIKE '%apply=thanks%'
), c AS (
-- correlation
SELECT a.vid as vid, b.vid as similar_vid, CORR(a.rating, b.rating) as correlation, count(*) as `count`
FROM d AS a
JOIN d AS b
ON a.cid = b.cid
WHERE a.vid <> b.vid
GROUP BY a.vid, b.vid
)
-- get related items to desc
select c.*, v.title from c
join v on c.similar_vid = v.vid
where c.vid = '6849385'
order by `count` desc
limit 10
@mac2000
Copy link
Author

mac2000 commented Jan 9, 2018

SELECT a.vid as vid, b.vid as similar_vid, CORR(a.rating, b.rating) as correlation, count(*) as count
FROM report.uid2vid AS a
JOIN report.uid2vid AS b
ON a.uid = b.uid
WHERE a._PARTITIONTIME >= '2018-01-01 00:00:00' AND b._PARTITIONTIME >= '2018-01-01 00:00:00'
AND a.vid <> b.vid
GROUP BY a.vid, b.vid
HAVING
  correlation IS NOT NULL
  AND count > 2
  ORDER BY correlation DESC
limit 100

@mac2000
Copy link
Author

mac2000 commented Jan 10, 2018

WITH d AS (
select 'mac' as user, 'beer' as item, 5.0 as rating
UNION ALL
select 'mac' as user, 'vine' as item, 1.0 as rating
UNION ALL
select 'mac' as user, 'konjak' as item, 5.0 as rating
-- UNION ALL
-- select 'mac' as user, 'viski' as item, 1.0 as rating

union all

select 'michael' as user, 'beer' as item, 3.0 as rating
UNION ALL
select 'michael' as user, 'vine' as item, 1.0 as rating
UNION ALL
select 'michael' as user, 'konjak' as item, 4.0 as rating
UNION ALL
select 'michael' as user, 'viski' as item, 4.0 as rating

union all

select 'maria' as user, 'beer' as item, 3.0 as rating
UNION ALL
select 'maria' as user, 'vine' as item, 5.0 as rating
UNION ALL
select 'maria' as user, 'konjak' as item, 1.0 as rating
UNION ALL
select 'maria' as user, 'viski' as item, 1.0 as rating
), i2i as (

SELECT a.item as item, b.item as similar, CORR(a.rating, b.rating) as correlation, count(*) as count
FROM d AS a
JOIN d AS b
ON a.user = b.user
WHERE a.item <> b.item
GROUP BY a.item, b.item

), u2u as (

SELECT a.user as user, b.user as similar, CORR(a.rating, b.rating) as correlation, count(*) as count
FROM d AS a
JOIN d AS b
ON a.item = b.item
WHERE a.user <> b.user
GROUP BY a.user, b.user

)

select 
u2u.user as u2u_user,
u2u.similar as u2u_similar,
u2u.correlation as u2u_correlation,
d1.item as d1_item,
d2.item as d2_item,
i2i.correlation as i2i_correlation
from u2u
join d as d1 on u2u.user = d1.user
join d as d2 on u2u.similar = d2.user
left join i2i on i2i.item = d1.item and i2i.similar = d2.item
where u2u.correlation > 0
and i2i.correlation > 0
and d2.item not in (select item from d as x where x.user = d1.user)

-- select 
-- u2u.user as u2u_user,
-- -- u2u.similar as u2u_similar,
-- -- u2u.correlation as u2u_correlation,
-- -- d1.item as d1_item,
-- d2.item as d2_item,
-- i2i.correlation as i2i_correlation
-- from u2u
-- join d as d1 on u2u.user = d1.user
-- join d as d2 on u2u.similar = d2.user
-- left join i2i on i2i.item = d1.item and i2i.similar = d2.item
-- where u2u.correlation > 0
-- and i2i.correlation > 0
-- and d2.item not in (select item from d as x where x.user = d1.user)
-- order by i2i.correlation desc
-- -- select * from d where user in (select similar from simusers)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment