Last active
September 14, 2015 15:24
-
-
Save sawidis/445bfd39783557052d45 to your computer and use it in GitHub Desktop.
Psoling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
messages.htm | |
messages.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Meta-psoling. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re | |
from dateutil import parser as date_parser | |
import functools | |
from lxml import etree | |
import pandas as pd | |
class Processor(object):
    """Run every input element through an ordered chain of Command objects.

    Each command wraps the stream produced by the previous one, so an
    element travels through the whole chain and is emitted before the next
    element is pulled from the input.

    >> processor = Processor(apache_log)
    >> processor.pipeline([
    >>     FilterIP(malicious_ip),
    >>     FilterByTime(after='02:34', before='04:52'),
    >>     GetURL()
    >> ])
    >> potential_targets = processor.process()
    >> for target in potential_targets:
    >>     # do sth
    """

    def __init__(self, inputs):
        """
        :param inputs: The input to be processed
        :type inputs: Any iterable object
        """
        self._inputs = inputs
        self._commands = []
        self._pipeline = None

    def pipeline(self, commands):
        """Set the ordered list of commands that make up the pipeline."""
        self._commands = commands

    def _prepare(self):
        """Chain the commands: each one consumes the previous one's output."""
        stream = self._inputs
        for stage in self._commands:
            stream = stage.apply(stream)
        self._pipeline = stream

    def process(self):
        """
        Generator over the processed items; the pipeline is (re)built when
        iteration starts.
        """
        self._prepare()
        for processed in self._pipeline:
            yield processed
class Command(object):
    """Base class for one stage of a Processor pipeline.

    Subclasses normally override `_apply` (per-item logic); `apply` drives
    it over a whole stream of items.
    """

    def _apply(self, item):
        """Processes one item from the input and returns it (or not).

        Returning None means "drop this item".

        :raises NotImplementedError: always, unless overridden by a subclass
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produces a confusing TypeError. NotImplementedError is the
        # exception meant for abstract methods.
        raise NotImplementedError(
            '{0} must implement _apply()'.format(type(self).__name__)
        )

    def apply(self, items):
        """Lazily yield the non-None results of `_apply` for each item.

        :param items: iterable of input items
        """
        for item in items:
            result = self._apply(item)
            if result is not None:
                yield result
class Grep(Command):
    """Pipeline stage that keeps (or, with `exclude`, drops) the items whose
    text matches a regular expression.
    """

    def __init__(self, pattern, gettext=None, exclude=False):
        """
        :param pattern: pattern we are interested in
        :type pattern: string
        :param gettext: how to get the greppable text from each input item
                        By default, it assumes the item per se is greppable.
        :type gettext: callable taking one item
        :param exclude: do we want to exclude input that matches the `pattern`?
        :type exclude: boolean
        """
        # The pattern is compiled as given and the exclusion is applied in
        # `_apply`. Wrapping it in a negative-lookahead regex instead would
        # reject any multi-line text ('.' does not match newlines) and would
        # break patterns that use inline flags or numbered backreferences.
        self.regexp = re.compile(pattern)
        self._exclude = bool(exclude)
        if gettext:
            self._get_text = gettext
        else:
            self._get_text = lambda item: item

    def _apply(self, item):
        """Return `item` if it passes the filter, otherwise None (dropped)."""
        text = self._get_text(item)
        # A missing text (None) simply cannot match; do not crash on it.
        matched = text is not None and self.regexp.search(text) is not None
        if matched != self._exclude:
            return item
class ExtractFbMessageNodes(Command):
    """Pipeline stage that flattens each input node into its message nodes."""

    def apply(self, items):
        """Yield every descendant <div class="message"> of every input node."""
        message_path = './/div[@class="message"]'
        for node in items:
            for message_node in node.findall(message_path):
                yield message_node
class BuildMessages(Command):
    """Pipeline stage that wraps each message node in a Message object."""

    def _apply(self, item):
        """Build and return the Message for one message node."""
        built = Message(item)
        return built
class Message(object):
    """One message extracted from a <div class="message"> node.

    Attributes:
        user: text of the node's <span class="user"> descendant
        timestamp: text of the <span class="meta"> descendant, parsed with
            dateutil's parser
        message: text of the element that follows the message div
    """

    def __init__(self, fb_message_div):
        """
        :param fb_message_div: the <div class="message"> element
        :type fb_message_div: element object (presumably lxml, since
                              `getnext()` is used -- verify against caller)
        """
        self.user = self._get_user(fb_message_div)
        self.timestamp = self._get_timestamp(fb_message_div)
        self.message = self._get_message(fb_message_div)

    def _get_user(self, fb_message_div):
        """Return the text of the first <span class="user"> descendant."""
        return fb_message_div.find('.//span[@class="user"]').text

    def _get_timestamp(self, fb_message_div):
        """Parse the text of the first <span class="meta"> descendant."""
        str_date = fb_message_div.find('.//span[@class="meta"]').text
        return date_parser.parse(str_date)

    def _get_message(self, fb_message_div):
        """Return the text of the element right after the message div."""
        return fb_message_div.getnext().text

    def __unicode__(self):
        return u'{timestamp} :: {user}'.format(
            timestamp=self.timestamp,
            user=self.user
        )

    def __str__(self):
        text = self.__unicode__()
        # On Python 2 `str` is the byte-string type, so __str__ must return
        # UTF-8 encoded bytes. On Python 3 `str` is already text and the
        # `unicode` builtin no longer exists, so return the text unchanged.
        if str is bytes:
            return text.encode('utf-8')
        return text

    def to_tuple(self):
        """Return the message as a (timestamp, user, message) tuple."""
        return (self.timestamp, self.user, self.message)
if __name__ == '__main__':
    # 1. Load the raw export.
    print('Reading facebook messages.htm file..')
    with open("messages.htm") as f:
        contents = f.read()

    # 2. Parse it leniently (recover=True) and collect the thread nodes.
    print('Filtering thread nodes..')
    lenient_parser = etree.XMLParser(recover=True)
    root = etree.fromstring(contents, parser=lenient_parser)
    threads = root.findall('.//div[@class="thread"]')

    # 3. Keep only threads whose text matches all of the required names and
    #    none of the excluded ones, then turn their message nodes into
    #    Message objects.
    print('Extracting psoling messages..')
    XMLNodeGrep = functools.partial(Grep, gettext=lambda item: item.text)
    required_names = [
        'Loukakos',
        'Tzortzis',
        'Stathis',
        'Stavropoulos',
        'Deltouzos',
    ]
    excluded_names = [
        'Georgiadou',
        'Poppins',
        'Saradis',
        'Ampatzidis',
    ]
    stages = [XMLNodeGrep(name) for name in required_names]
    stages += [XMLNodeGrep(name, exclude=True) for name in excluded_names]
    stages += [ExtractFbMessageNodes(), BuildMessages()]

    processor = Processor(threads)
    processor.pipeline(stages)
    collected = list(processor.process())

    # 4. Order chronologically.
    print("Sorting psoling messages by date..")
    messages = sorted(collected, key=lambda m: m.timestamp)

    # 5. Tabulate and export.
    print("Creating a dataframe..")
    rows = [msg.to_tuple() for msg in messages]
    dataset = pd.DataFrame(rows, columns=['timestamp', 'user', 'message'])

    print("Saving to json..")
    dataset.to_json('messages.json')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment