Skip to content

Instantly share code, notes, and snippets.

Last active September 14, 2015 15:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sawidis/445bfd39783557052d45 to your computer and use it in GitHub Desktop.
Save sawidis/445bfd39783557052d45 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
#!/usr/bin/env python
import re
from dateutil import parser as date_parser
import functools
from lxml import etree
import pandas as pd
class Processor(object):
    """Run input elements through a pipeline of commands (Command objects).

    The processor handles its input one element at a time: an element goes
    through the pipeline and gets emitted before the processor starts
    processing the next one (as long as every command's ``apply`` is itself
    lazy).

    Example::

        processor = Processor(apache_log)
        processor.pipeline([
            FilterIP(malicious_ip),
            FilterByTime(after='02:34', before='04:52'),
            GetURL(),
        ])
        potential_targets = processor.process()
        for target in potential_targets:
            pass  # do sth
    """

    def __init__(self, inputs):
        """
        :param inputs: The input to be processed
        :type inputs: Any object that implements the Iterator protocol
        """
        self._inputs = inputs
        self._commands = []
        self._pipeline = None

    def process(self):
        """Yield the processed items one by one.

        The command chain is (re)built on every call, so the pipeline most
        recently installed with :meth:`pipeline` is always the one used.
        """
        # Without this call ``self._pipeline`` is still None and iterating it
        # would raise a TypeError.
        self._prepare()
        for output in self._pipeline:
            yield output

    def pipeline(self, commands):
        """Install the commands that every input element will go through.

        :param commands: commands to apply, in order
        :type commands: iterable of objects exposing ``apply(items)``
        """
        # Copy into a list so a one-shot iterable can still be replayed.
        self._commands = list(commands)

    def _prepare(self):
        """Chain the commands so each one consumes the previous one's output."""
        chained = self._inputs
        for command in self._commands:
            chained = command.apply(chained)
        self._pipeline = chained
class Command(object):
    """Base class for a single pipeline step.

    Subclasses normally override :meth:`_apply`; a step that needs to emit
    several outputs per input can override :meth:`apply` instead.
    """

    def _apply(self, item):
        """Processes one item from the input and returns it (or not)"""
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it fails with a confusing TypeError on Python 3.
        raise NotImplementedError

    def apply(self, items):
        """Lazily run ``_apply`` over ``items``, dropping ``None`` results.

        :param items: the items to process
        :type items: iterable
        """
        for item in items:
            result = self._apply(item)
            if result is not None:
                yield result
class Grep(Command):
    """Keep only the items whose text matches (or does not match) a pattern."""

    def __init__(self, pattern, gettext=None, exclude=False):
        """
        :param pattern: pattern we are interested in
        :type pattern: string
        :param gettext: how to get the greppable text from each input item
                        By default, it assumes the item per se is greppable.
        :type gettext: lambda
        :param exclude: do we want to exclude input that matches the `pattern`?
        :type exclude: boolean
        """
        # Exclusion is tracked with a flag rather than a negative-lookahead
        # rewrite of the pattern: '^((?!p).)*$' never matches multi-line text
        # because '.' does not match a newline.
        self.regexp = re.compile(pattern)
        self._exclude = bool(exclude)
        if gettext is not None:
            self._get_text = gettext
        else:
            self._get_text = lambda item: item

    def _apply(self, item):
        """Return ``item`` when it passes the filter, otherwise ``None``."""
        text = self._get_text(item)
        if text is None:
            # Treat missing text as empty so it simply never matches.
            text = ''
        matched = self.regexp.search(text) is not None
        if matched != self._exclude:
            return item
        return None
class ExtractFbMessageNodes(Command):
    """Expand each incoming node into the message divs found beneath it."""

    def apply(self, items):
        """Yield every ``div`` with class ``message`` under each item.

        Overrides ``apply`` (not ``_apply``) because one input node can
        produce any number of outputs.
        """
        message_path = './/div[@class="message"]'
        for node in items:
            for message_div in node.findall(message_path):
                yield message_div
class BuildMessages(Command):
    """Turn each incoming message node into a Message object."""

    def _apply(self, item):
        """Wrap ``item`` in a Message and return it."""
        built = Message(item)
        return built
class Message(object):
    """One message: who sent it, when, and what it said.

    :param fb_message_div: the ``div`` with class ``message`` of one message
    :type fb_message_div: lxml element (``getnext()`` is used)
    """

    def __init__(self, fb_message_div):
        self.user = self._get_user(fb_message_div)
        self.timestamp = self._get_timestamp(fb_message_div)
        self.message = self._get_message(fb_message_div)

    def _get_user(self, fb_message_div):
        """Return the text of the ``span`` with class ``user``."""
        return fb_message_div.find('.//span[@class="user"]').text

    def _get_timestamp(self, fb_message_div):
        """Parse the text of the ``span`` with class ``meta`` into a date."""
        str_date = fb_message_div.find('.//span[@class="meta"]').text
        return date_parser.parse(str_date)

    def _get_message(self, fb_message_div):
        """Return the text of the element that follows the message div."""
        return fb_message_div.getnext().text

    def __unicode__(self):
        return u'{timestamp} :: {user}'.format(
            timestamp=self.timestamp,
            user=self.user
        )

    def __str__(self):
        text = self.__unicode__()
        if str is bytes:
            # Python 2: __str__ must return a byte string.
            return text.encode('utf-8')
        # Python 3: str is already text and ``unicode`` does not exist.
        return text

    def to_tuple(self):
        """Return ``(timestamp, user, message)``."""
        return (self.timestamp, self.user, self.message)
if __name__ == '__main__':
    # Script entry point: read messages.htm, drop the threads whose text
    # matches any excluded name, build Message objects from what is left,
    # sort them by date and dump them through a pandas DataFrame.
    print('Reading facebook messages.htm file..')
    # Read bytes so lxml does the decoding itself; lxml refuses unicode
    # strings that carry an encoding declaration.
    with open("messages.htm", "rb") as f:
        contents = f.read()

    print('Filtering thread nodes..')
    # recover=True lets the XML parser get through HTML that is not
    # well-formed XML.
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(contents, parser=parser)
    threads = root.findall('.//div[@class="thread"]')

    print('Extracting psoling messages..')
    # Thread nodes are grepped on their own text.
    XMLNodeGrep = functools.partial(Grep, gettext=lambda item: item.text)
    processor = Processor(threads)
    processor.pipeline([
        XMLNodeGrep('Georgiadou', exclude=True),
        XMLNodeGrep('Poppins', exclude=True),
        XMLNodeGrep('Saradis', exclude=True),
        XMLNodeGrep('Ampatzidis', exclude=True),
        ExtractFbMessageNodes(),
        BuildMessages(),
    ])
    messages = list(processor.process())

    print("Sorting psoling messages by date..")
    messages.sort(key=lambda m: m.timestamp)

    print("Creating a dataframe..")
    dataset = pd.DataFrame(
        [m.to_tuple() for m in messages],
        columns=['timestamp', 'user', 'message']
    )

    print("Saving to json..")
    # NOTE(review): the output file name is not visible in this source;
    # 'messages.json' is an assumption -- confirm before relying on it.
    dataset.to_json('messages.json')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment