Created
February 25, 2017 19:28
-
-
Save theasder/f3a5d3a632b7c5288b67abee323ab1bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import datetime | |
import re | |
import string | |
import sqlite3 | |
from numpy import array, median | |
import vk | |
# Module-level VK API client used by Page for every request. The session is
# created without passing an access token.
session = vk.Session()
api = vk.API(session)
class Post:
    """A single VK wall post with its engagement counters and cleaned text.

    Attributes set by the constructor:
        owner_id, id: identifiers used to build the post URL.
        likes, reposts, comments: counters read from the API payload.
        date: the payload's ``date`` value, stored unchanged.
        text: the raw post text.
        filtered_text: ``text`` after the clean-up done by ``word_freq``.
        link: ``https://vk.com/wall<owner_id>_<id>``.
        virality: ``likes + 5 * reposts``.
        score: starts at 0; Page.set_score() overwrites it.
    """

    # Patterns are compiled once and written as raw strings so that ``\w``
    # and ``\s`` reach the regex engine instead of being parsed as (invalid)
    # string escapes.
    _WHITESPACE_RE = re.compile(r'\s+')
    _QUOTES_RE = re.compile(r'[«»"]+')
    # A hashtag, optionally followed by an ``@name`` suffix.
    _HASHTAG_RE = re.compile(r'#[\wа-яА-ЯёЁ]+(\@\w+)?')

    def __init__(self, owner_id, id, post=None, members=-1):
        """Build a post from an API payload.

        Args:
            owner_id: id of the wall the post belongs to.
            id: id of the post on that wall.
            post: mapping with ``likes``, ``reposts`` and ``comments`` (each
                a mapping holding ``count``) plus ``date`` and ``text``.
            members: accepted for compatibility with callers; not used.

        Raises:
            TypeError: if ``post`` is omitted.
        """
        if post is None:
            raise TypeError("Post() requires the 'post' payload")
        self.owner_id = owner_id
        self.id = id
        self.likes = post['likes']['count']
        self.date = post['date']
        self.text = post['text']
        self.reposts = post['reposts']['count']
        self.comments = post['comments']['count']
        self.link = "https://vk.com/wall" + str(self.owner_id) + "_" + str(self.id)
        # A repost is weighted five times as much as a like.
        self.virality = self.likes + 5 * self.reposts
        self.score = 0
        self.word_freq()

    def __repr__(self):
        return str((self.score, self.filtered_text, self.link, self.date))

    def query(self):
        """Return ``(filtered_text, link, score, date)`` for a DB insert."""
        return self.filtered_text, self.link, self.score, self.date

    def word_freq(self):
        """Store a cleaned copy of ``self.text`` in ``self.filtered_text``.

        Despite the name, no frequencies are computed. ``<br>`` markers
        become spaces, whitespace runs collapse to one space, guillemets and
        double quotes are dropped, and hashtags are removed.
        """
        text = " ".join(self.text.split("<br>"))
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._QUOTES_RE.sub('', text)
        # Hashtags are removed after whitespace collapsing, so the spaces on
        # both sides of a removed hashtag are kept as they are.
        text = self._HASHTAG_RE.sub('', text)
        self.filtered_text = text
class Page:
    """Wall of a VK community, with its recent posts downloaded and scored.

    Constructing a Page performs several API calls: it resolves the
    community id, reads the member count and the total number of wall posts,
    downloads the posts of the last ``2 * 366`` days and scores them.

    Attributes:
        id: wall owner id (negated group id when built from a name/URL).
        members: ``count`` from ``groups.getMembers``.
        number_of_posts: first element of the ``wall.get`` response.
        posts: list of Post objects inside the time window.
    """

    # Stride used for paging through the wall. It is also sent as ``count``:
    # the VK documentation caps wall.get at 100 posts per call.
    _PAGE_SIZE = 100
    # Length of the download window, in days.
    _HISTORY_DAYS = 2 * 366

    def __init__(self, id):  # since_time = 0, until_time = 9223372036854775807):
        """Load a community wall.

        Args:
            id: either a community screen name / vk.com URL (str), or an
                already-resolved wall owner id.
        """
        if isinstance(id, str):
            response = api.groups.getById(group_ids=self._screen_name(id))
            # Walls of communities are addressed by the negated group id.
            self.id = -response[0]['gid']
        else:
            self.id = id
        response = api.groups.getMembers(group_id=-self.id, count=1)
        self.members = response['count']
        response = api.wall.get(owner_id=self.id, count=1)
        self.number_of_posts = response[0]

        until_time = int(round(time.time()))
        window = datetime.timedelta(days=self._HISTORY_DAYS)
        since_time = until_time - window.total_seconds()
        self.posts = self.get_posts(since_time, until_time)
        self.set_score()

    def __repr__(self):
        return "".join(str(post) + "\n" for post in self.posts)

    @staticmethod
    def _screen_name(address):
        """Reduce a vk.com URL to the bare community screen name.

        A plain screen name is returned unchanged. The dot in ``vk.com`` is
        matched literally, so names such as ``vk_community`` are not mangled.
        """
        name = re.sub(r'^(?:https?://)?(?:(?:www\.|m\.)?vk\.com/?)?', '',
                      address.strip())
        return name.strip('/')

    def get_posts(self, since_time, until_time):
        """Download the wall posts dated within [since_time, until_time].

        The wall is read page by page. Reading stops after the first page
        that contains a post older than ``since_time``.
        NOTE(review): this relies on wall.get returning newest posts first.

        Returns:
            list of Post objects, in the order the API returned them.
        """
        pages = self.number_of_posts // self._PAGE_SIZE + 1
        posts = []
        for offset in range(0, pages * self._PAGE_SIZE, self._PAGE_SIZE):
            # Element 0 of the response is skipped (__init__ reads it as the
            # total post count); the rest are the post payloads.
            batch = api.wall.get(owner_id=self.id, count=self._PAGE_SIZE,
                                 offset=offset)[1:]
            if not batch:
                break
            reached_older = False
            for post in batch:
                if since_time <= post['date'] <= until_time:
                    posts.append(Post(self.id, post['id'], post, self.members))
                elif post['date'] < since_time and not post.get('is_pinned'):
                    # A pinned post sits on top of the wall regardless of its
                    # age, so it must not end the scan.
                    # NOTE(review): assumes the payload flags it as
                    # ``is_pinned``; without that key nothing changes.
                    reached_older = True
            if reached_older:
                break
        return posts

    def set_score(self):
        """Set each post's score to its virality divided by the page median.

        With no posts this is a no-op. If the median is zero the raw virality
        is used as the score, which keeps the ranking and avoids inf/nan.
        """
        if not self.posts:
            return
        med = median(array([post.virality for post in self.posts]))
        for post in self.posts:
            post.score = post.virality / med if med else float(post.virality)
# Download the wall of the 'iamdev' community, store its posts in SQLite and
# print the URLs of the 20 highest-scoring ones.
page = Page('iamdev')
queries = [post.query() for post in page.posts]

conn = sqlite3.connect('posts/my_posts.db')
try:
    c = conn.cursor()

    # IF NOT EXISTS lets the script run again against an existing database.
    c.execute('''CREATE TABLE IF NOT EXISTS posts(
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        text VARCHAR(500),
        url VARCHAR(100),
        virality FLOAT,
        time BIGINT);''')

    c.executemany('INSERT INTO posts (text, url, virality, time) VALUES (?,?,?,?)', queries)

    # Remove duplicates: keep only the newest row for every URL, so repeated
    # runs refresh the stored posts instead of piling up copies.
    c.execute("""
        DELETE FROM posts
        WHERE ID NOT IN (SELECT MAX(ID)
                         FROM posts
                         GROUP BY url
                         HAVING MAX(ID) IS NOT NULL)
    """)

    # Save (commit) the changes; without this the inserts are rolled back
    # when the connection is closed.
    conn.commit()

    for row in c.execute('SELECT url FROM posts ORDER BY virality DESC LIMIT 20'):
        print(row[0])
finally:
    # Close the connection even if one of the statements above failed.
    conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment