Skip to content

Instantly share code, notes, and snippets.

@sawidis
Last active September 14, 2015 15:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sawidis/445bfd39783557052d45 to your computer and use it in GitHub Desktop.
Save sawidis/445bfd39783557052d45 to your computer and use it in GitHub Desktop.
Psoling
messages.htm
messages.json
Meta-psoling.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
#!/usr/bin/env python
import re
from dateutil import parser as date_parser
import functools
from lxml import etree
import pandas as pd
class Processor(object):
    """Run an iterable of inputs through a chain of command objects.

    Each command's ``apply`` is given the iterable produced by the previous
    command (the first one receives the raw inputs), so the commands form a
    pipeline. Results are produced lazily while iterating over ``process()``.

    >> processor = Processor(apache_log)
    >> processor.pipeline([
    >>     FilterIP(malicious_ip),
    >>     FilterByTime(after='02:34', before='04:52'),
    >>     GetURL()
    >> ])
    >> potential_targets = processor.process()
    >> for target in potential_targets:
    >>     # do sth
    """

    def __init__(self, inputs):
        """
        :param inputs: The input to be processed
        :type inputs: Any iterable object
        """
        self._inputs = inputs
        self._commands = []
        self._pipeline = None

    def pipeline(self, commands):
        """
        Register the commands to run, in the order they should be applied.

        :param commands: objects exposing an ``apply(items)`` method
        """
        self._commands = commands

    def _prepare(self):
        """Chain every registered command on top of the inputs."""
        stream = self._inputs
        for stage in self._commands:
            stream = stage.apply(stream)
        self._pipeline = stream

    def process(self):
        """
        Generator over the processed items.

        The pipeline is (re)built when iteration starts, not when this
        method is called.
        """
        self._prepare()
        for processed in self._pipeline:
            yield processed
class Command(object):
    """Base class for a single pipeline stage.

    Subclasses normally override ``_apply`` to handle one item at a time;
    ``apply`` takes care of iterating over the input and of dropping items
    for which ``_apply`` returned ``None``.
    """

    def _apply(self, item):
        """Processes one item from the input and returns it (or not).

        Returning ``None`` means "emit nothing for this item".

        :raises NotImplementedError: always, unless overridden by a subclass
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produces a confusing TypeError instead of the intended error.
        raise NotImplementedError

    def apply(self, items):
        """Lazily yield the non-``None`` results of ``_apply`` for ``items``.

        :param items: iterable of input items
        """
        for item in items:
            result = self._apply(item)
            if result is not None:
                yield result
class Grep(Command):
    """Pipeline stage that keeps, or with ``exclude`` drops, matching items."""

    def __init__(self, pattern, gettext=None, exclude=False):
        """
        :param pattern: pattern we are interested in
        :type pattern: string
        :param gettext: how to get the greppable text from each input item
                        By default, it assumes the item per se is greppable.
        :type gettext: callable taking one item
        :param exclude: do we want to exclude input that matches the `pattern`?
        :type exclude: boolean
        """
        # Exclusion is done by inverting the match result in _apply rather
        # than by wrapping the pattern in '^((?!pattern).)*$': that wrapper
        # never matches text containing a newline (so multi-line items were
        # dropped even when they did not contain the pattern) and it shifts
        # the pattern's group numbers.
        self.regexp = re.compile(pattern)
        self._exclude = bool(exclude)
        self._get_text = gettext or (lambda item: item)

    def _apply(self, item):
        """Return ``item`` if it passes the filter, otherwise ``None``."""
        text = self._get_text(item)
        # A missing text (None) cannot contain the pattern; treat it as a
        # non-match instead of letting re.search raise TypeError.
        matched = text is not None and self.regexp.search(text) is not None
        if matched != self._exclude:
            return item
class ExtractFbMessageNodes(Command):
    """Flatten each input node into its descendant message ``div`` nodes."""

    def apply(self, items):
        """Yield every ``div`` with class "message" found under each item.

        :param items: iterable of nodes exposing ``findall``
        """
        message_xpath = './/div[@class="message"]'
        for thread_node in items:
            for message_node in thread_node.findall(message_xpath):
                yield message_node
class BuildMessages(Command):
    """Pipeline stage that wraps each incoming node in a ``Message``."""

    def _apply(self, item):
        """Build and return the ``Message`` for one node."""
        built = Message(item)
        return built
class Message(object):
    """One message (user, timestamp, text) extracted from a message node."""

    def __init__(self, fb_message_div):
        """
        :param fb_message_div: node for a single message; it must expose
                               ``find`` and ``getnext`` (the script passes
                               elements parsed with lxml)
        """
        self.user = self._get_user(fb_message_div)
        self.timestamp = self._get_timestamp(fb_message_div)
        self.message = self._get_message(fb_message_div)

    def _get_user(self, fb_message_div):
        """Return the text of the descendant ``span`` with class "user"."""
        return fb_message_div.find('.//span[@class="user"]').text

    def _get_timestamp(self, fb_message_div):
        """Parse the text of the ``span`` with class "meta" as a date."""
        str_date = fb_message_div.find('.//span[@class="meta"]').text
        return date_parser.parse(str_date)

    def _get_message(self, fb_message_div):
        """Return the text of the node that follows the message node."""
        # The body is read from the next sibling, not from inside the div.
        return fb_message_div.getnext().text

    def __unicode__(self):
        return u'{timestamp} :: {user}'.format(
            timestamp=self.timestamp,
            user=self.user
        )

    def __str__(self):
        """Return the textual form; works on both Python 2 and Python 3."""
        text = self.__unicode__()
        if isinstance(text, str):
            # Python 3: str is already unicode text and there is no
            # ``unicode`` builtin, so return it unchanged.
            return text
        # Python 2: __str__ must return a byte string.
        return text.encode('utf-8')

    def to_tuple(self):
        """Return ``(timestamp, user, message)``."""
        return (self.timestamp, self.user, self.message)
if __name__ == '__main__':
    # Script entry point: read messages.htm, keep the threads whose text
    # matches the wanted names (and none of the excluded ones), turn their
    # messages into Message objects and dump them, sorted by date, as JSON.
    print('Reading facebook messages.htm file..')
    # Read raw bytes and let the XML parser handle the encoding. In text
    # mode Python 3 would decode with the locale's default encoding, and
    # lxml rejects unicode strings that carry an encoding declaration.
    with open("messages.htm", "rb") as f:
        contents = f.read()

    print('Filtering thread nodes..')
    # recover=True makes the parser tolerate markup that is not well-formed.
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(contents, parser=parser)
    threads = root.findall('.//div[@class="thread"]')

    print('Extracting psoling messages..')
    # Grep variant that matches against a node's own text.
    XMLNodeGrep = functools.partial(Grep, gettext=lambda item: item.text)
    processor = Processor(threads)
    processor.pipeline([
        XMLNodeGrep('Loukakos'),
        XMLNodeGrep('Tzortzis'),
        XMLNodeGrep('Stathis'),
        XMLNodeGrep('Stavropoulos'),
        XMLNodeGrep('Deltouzos'),
        XMLNodeGrep('Georgiadou', exclude=True),
        XMLNodeGrep('Poppins', exclude=True),
        XMLNodeGrep('Saradis', exclude=True),
        XMLNodeGrep('Ampatzidis', exclude=True),
        ExtractFbMessageNodes(),
        BuildMessages(),
    ])
    messages = list(processor.process())

    print("Sorting psoling messages by date..")
    messages.sort(key=lambda m: m.timestamp)

    print("Creating a dataframe..")
    dataset = pd.DataFrame(
        [m.to_tuple() for m in messages],
        columns=['timestamp', 'user', 'message']
    )

    print("Saving to json..")
    dataset.to_json('messages.json')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment