Skip to content

Instantly share code, notes, and snippets.

@alex-hse-repository
Last active June 3, 2022 08:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alex-hse-repository/fa667705921eb0383f69cbee34e51173 to your computer and use it in GitHub Desktop.
Save alex-hse-repository/fa667705921eb0383f69cbee34e51173 to your computer and use it in GitHub Desktop.
from etna.datasets import TSDataset
from itertools import filterfalse
import pandas as pd
import re
def get_segments(tags: list):
    """Return the page names from ``data["Page"]`` that contain any of ``tags``.

    Matching is a plain substring test. Results are grouped by tag, in the
    order the tags are given; a page that contains several tags is listed
    once per matching tag.

    Args:
        tags: Substrings to look for in the page names.

    Returns:
        A list of matching page names (possibly with duplicates).
    """
    # ``data`` is the module-level DataFrame; it is looked up once per tag.
    return [page for tag in tags for page in data["Page"] if tag in page]
def get_ts(tags: list):
    """Build a daily ``TSDataset`` from the pages whose name contains any tag.

    Every selected row of the module-level ``data`` frame becomes one
    segment: the column labels supply the timestamps and the row values
    supply the target.

    Args:
        tags: Substrings passed to ``get_segments`` to choose the pages.

    Returns:
        A ``TSDataset`` with ``freq="D"`` holding one segment per page.
    """
    selected = data[data["Page"].isin(get_segments(tags))]

    frames = []
    for row_label in selected.index:
        row = selected.loc[row_label]
        # Positional slice: skips the first column and the last three.
        # Presumably these are "Page" and the derived language/agent/access
        # columns -- verify against the layout of ``data``.
        history = row[1:-3]
        frames.append(
            pd.DataFrame(
                {
                    "timestamp": history.index.values,
                    "target": history.astype(float).values,
                    "segment": row["Page"],
                }
            )
        )

    wide = TSDataset.to_dataset(pd.concat(frames))
    return TSDataset(df=wide, freq="D")
def get_language(page: str) -> str:
    """Extract the two-letter language code of a Wikipedia page identifier.

    The code is the two lowercase letters immediately preceding the literal
    text ``.wikipedia.org``. The dots are escaped so that they match only a
    real ``.`` and not an arbitrary character.

    Args:
        page: Page identifier, e.g. ``"2NE1_zh.wikipedia.org_all-access_spider"``.

    Returns:
        The two-letter code (e.g. ``"zh"``), or ``"na"`` when the identifier
        contains no ``xx.wikipedia.org`` host.
    """
    match = re.search(r"([a-z]{2})\.wikipedia\.org", page)
    return match.group(1) if match else "na"
def get_agent(page: str) -> str:
    """Classify the traffic agent encoded in a page identifier.

    Only the trailing field is inspected, so an article title that merely
    contains ``_spider`` (e.g. ``Black_widow_spider_...``) is not mistaken
    for spider traffic.

    NOTE(review): assumes identifiers end with ``_<agent>``, as in the
    Wikipedia web-traffic data this script loads -- confirm against the CSV.

    Args:
        page: Page identifier, e.g. ``"2NE1_zh.wikipedia.org_all-access_spider"``.

    Returns:
        ``"spider"`` when the identifier ends with ``_spider``, otherwise
        ``"all-agents"``.
    """
    return "spider" if page.endswith("_spider") else "all-agents"
def get_access(page: str) -> str:
    """Extract the access type encoded in a page identifier.

    The access type is read from the second-to-last ``_``-separated field
    rather than searched for anywhere in the string, so an article title
    such as ``Linux_desktop_...`` cannot be mistaken for the access field.

    NOTE(review): assumes identifiers end with ``_<access>_<agent>``, as in
    the Wikipedia web-traffic data this script loads -- confirm against the
    CSV.

    Args:
        page: Page identifier, e.g. ``"2NE1_zh.wikipedia.org_all-access_spider"``.

    Returns:
        ``"desktop"``, ``"mobile-web"`` or ``"all-access"``; ``"na"`` when
        the field is missing or holds any other value.
    """
    known_access = ("desktop", "mobile-web", "all-access")
    # Neither the access nor the agent value contains "_", so splitting from
    # the right isolates them regardless of underscores in the title.
    fields = page.rsplit("_", 2)
    if len(fields) == 3 and fields[1] in known_access:
        return fields[1]
    return "na"
# Load the raw table and derive three descriptive columns from the page name.
# The derived columns are appended last, which get_ts relies on when it
# slices off the trailing three columns.
data = pd.read_csv("wikipedia.csv")
data = data.assign(
    language=data["Page"].apply(get_language),
    agent=data["Page"].apply(get_agent),
    access=data["Page"].apply(get_access),
)

# Keep only English/Russian pages with aggregated agent and access traffic.
data = data[
    data["language"].isin(["en", "ru"])
    & data["agent"].eq("all-agents")
    & data["access"].eq("all-access")
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment