sawidis/.gitignore

## .gitignore
messages.htm
messages.json

## readme.txt
Meta-psoling.

## Psoling.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Psoling.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## psoling.py
#!/usr/bin/env python

import re
from dateutil import parser as date_parser
import functools
from lxml import etree
import pandas as pd


class Processor(object):
    """Feed input elements through an ordered chain of Command objects.

    Each command's ``apply`` receives the output of the previous command
    (the first one receives the raw inputs). When ``apply`` is a generator,
    as ``Command.apply`` is, an element goes through the whole pipeline and
    gets emitted before the processor starts processing the next one.

    >> processor = Processor(apache_log)
    >> processor.pipeline([
    >>     FilterIP(malicious_ip),
    >>     FilterByTime(after='02:34', before='04:52'),
    >>     GetURL()
    >> ])
    >> potential_targets = processor.process()
    >> for target in potential_targets:
    >>      # do sth
    """

    def __init__(self, inputs):
        """
        :param inputs: The input to be processed
        :type inputs: Any object that can be iterated over
        """
        self._commands = []
        self._pipeline = None
        self._inputs = inputs

    def pipeline(self, commands):
        """
        Set the commands to run, in the order they should be applied.

        :param commands: objects exposing an ``apply(items)`` method
        :type commands: iterable
        """
        self._commands = commands

    def process(self):
        """
        Generator over the processed items.

        The command chain is (re)built when iteration starts, not when this
        method is called.
        """
        self._prepare()
        for processed in self._pipeline:
            yield processed

    def _prepare(self):
        """Chain the commands: each one wraps the stream built so far."""
        self._pipeline = self._inputs
        for step in self._commands:
            upstream = self._pipeline
            self._pipeline = step.apply(upstream)


class Command(object):
    """Base class for a single pipeline step.

    Subclasses implement ``_apply`` to transform or filter one item;
    ``apply`` runs it over a whole stream and drops the items for which
    ``_apply`` returned ``None``.
    """

    def _apply(self, item):
        """Processes one item from the input and returns it (or not)

        :param item: one element of the input stream
        :returns: the processed item, or ``None`` to drop it
        :raises NotImplementedError: always, unless overridden by a subclass
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it would produce a confusing TypeError instead.
        raise NotImplementedError(
            '%s must implement _apply()' % type(self).__name__
        )

    def apply(self, items):
        """Lazily yield the non-``None`` results of ``_apply`` for each item.

        :param items: iterable of input items
        """
        for item in items:
            result = self._apply(item)
            if result is not None:
                yield result


class Grep(Command):
    """Pipeline step that keeps (or, with ``exclude``, drops) matching items."""

    def __init__(self, pattern, gettext=None, exclude=False):
        """
        :param pattern: pattern we are interested in
        :type pattern: string
        :param gettext: how to get the greppable text from each input item
                        By default, it assumes the item per se is greppable.
        :type gettext: callable taking one item
        :param exclude: do we want to exclude input that matches the `pattern`?
        :type exclude: boolean
        """
        # The pattern is compiled as given and the match result is inverted
        # for `exclude`. Wrapping it in '^((?!pattern).)*$' rejected every
        # text containing a newline ('.' does not match '\n') and shifted
        # the numbering of any groups inside the pattern.
        self.regexp = re.compile(pattern)
        self._exclude = bool(exclude)

        if gettext:
            self._get_text = gettext
        else:
            self._get_text = lambda item: item

    def _apply(self, item):
        """Return `item` when it passes the filter, otherwise ``None``."""
        text = self._get_text(item)
        if text is None:
            # e.g. an XML node without text: nothing to match against.
            text = ''
        matched = self.regexp.search(text) is not None
        if matched != self._exclude:
            return item
        return None


class ExtractFbMessageNodes(Command):
    """Pipeline step that expands each item into its message nodes."""

    def apply(self, items):
        """Yield every descendant ``<div class="message">`` of each item.

        :param items: iterable of nodes exposing ``findall``
        """
        for thread in items:
            for node in thread.findall('.//div[@class="message"]'):
                yield node

class BuildMessages(Command):
    """Pipeline step that wraps each incoming node in a Message object."""

    def _apply(self, item):
        """Build the Message for one message node."""
        built = Message(item)
        return built


class Message(object):
    """A single message: its author, timestamp and text."""

    def __init__(self, fb_message_div):
        """
        :param fb_message_div: the ``<div class="message">`` node of a message
        :type fb_message_div: node exposing ``find`` and ``getnext``
        """
        self.user = self._get_user(fb_message_div)
        self.timestamp = self._get_timestamp(fb_message_div)
        self.message = self._get_message(fb_message_div)

    def _get_user(self, fb_message_div):
        """Return the text of the ``<span class="user">`` descendant."""
        return fb_message_div.find('.//span[@class="user"]').text

    def _get_timestamp(self, fb_message_div):
        """Parse the text of the ``<span class="meta">`` descendant as a date."""
        str_date = fb_message_div.find('.//span[@class="meta"]').text
        return date_parser.parse(str_date)

    def _get_message(self, fb_message_div):
        """Return the text of the node that follows the message div."""
        # The body is read from the next sibling, not from inside the div.
        return fb_message_div.getnext().text

    def __unicode__(self):
        return u'{timestamp} :: {user}'.format(
            timestamp=self.timestamp,
            user=self.user
        )

    def __str__(self):
        """Return the ``timestamp :: user`` summary as a native string."""
        text = self.__unicode__()
        if str is bytes:
            # Python 2: str() must return bytes.
            return text.encode('utf-8')
        # Python 3: there is no `unicode` builtin and str() must return text.
        return text

    def to_tuple(self):
        """Return ``(timestamp, user, message)``."""
        return (self.timestamp, self.user, self.message)


if __name__ == '__main__':
    # Script entry point: read messages.htm, keep the threads that pass the
    # name filters below, and dump their messages to messages.json.
    print('Reading facebook messages.htm file..')
    # Read raw bytes and let the XML parser handle decoding: text mode
    # depends on the locale encoding, and lxml refuses unicode strings that
    # carry an encoding declaration.
    with open("messages.htm", "rb") as f:
        contents = f.read()

    print('Filtering thread nodes..')
    # recover=True lets the parser continue past markup errors.
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(contents, parser=parser)
    threads = root.findall('.//div[@class="thread"]')

    print('Extracting psoling messages..')
    # Grep on a node's own text rather than on the node object.
    XMLNodeGrep = functools.partial(Grep, gettext=lambda item: item.text)

    processor = Processor(threads)
    # The filters are chained, so a thread must pass every one of them:
    # contain all the names in the first group and none in the second.
    processor.pipeline([
        XMLNodeGrep('Loukakos'),
        XMLNodeGrep('Tzortzis'),
        XMLNodeGrep('Stathis'),
        XMLNodeGrep('Stavropoulos'),
        XMLNodeGrep('Deltouzos'),
        XMLNodeGrep('Georgiadou', exclude=True),
        XMLNodeGrep('Poppins', exclude=True),
        XMLNodeGrep('Saradis', exclude=True),
        XMLNodeGrep('Ampatzidis', exclude=True),
        ExtractFbMessageNodes(),
        BuildMessages(),
    ])
    messages = list(processor.process())

    print("Sorting psoling messages by date..")
    messages.sort(key=lambda m: m.timestamp)

    print("Creating a dataframe..")
    dataset = pd.DataFrame(
        [m.to_tuple() for m in messages],
        columns=['timestamp', 'user', 'message']
    )

    print("Saving to json..")
    dataset.to_json('messages.json')
	#!/usr/bin/env python

	import re
	from dateutil import parser as date_parser
	import functools
	from lxml import etree
	import pandas as pd


	class Processor(object):
	    """Feed input elements through an ordered chain of Command objects.

	    Each command's ``apply`` receives the output of the previous command
	    (the first one receives the raw inputs). When ``apply`` is a generator,
	    as ``Command.apply`` is, an element goes through the whole pipeline and
	    gets emitted before the processor starts processing the next one.

	    >> processor = Processor(apache_log)
	    >> processor.pipeline([
	    >>     FilterIP(malicious_ip),
	    >>     FilterByTime(after='02:34', before='04:52'),
	    >>     GetURL()
	    >> ])
	    >> potential_targets = processor.process()
	    >> for target in potential_targets:
	    >>      # do sth
	    """

	    def __init__(self, inputs):
	        """
	        :param inputs: The input to be processed
	        :type inputs: Any object that can be iterated over
	        """
	        self._inputs = inputs
	        self._commands = []
	        self._pipeline = None

	    def process(self):
	        """
	        When iterating over it, yields the processed items one by one.
	        """
	        self._prepare()
	        for output in self._pipeline:
	            yield output

	    def pipeline(self, commands):
	        """
	        Set the commands to run, in the order they should be applied.

	        :param commands: objects exposing an ``apply(items)`` method
	        :type commands: iterable
	        """
	        self._commands = commands

	    def _prepare(self):
	        """Chain the commands: each one wraps the stream built so far."""
	        self._pipeline = self._inputs
	        for command in self._commands:
	            self._pipeline = command.apply(self._pipeline)


	class Command(object):
	    """Base class for a single pipeline step.

	    Subclasses implement ``_apply`` to transform or filter one item;
	    ``apply`` runs it over a whole stream and drops the items for which
	    ``_apply`` returned ``None``.
	    """

	    def _apply(self, item):
	        """Processes one item from the input and returns it (or not)

	        :raises NotImplementedError: always, unless overridden by a subclass
	        """
	        # NotImplemented is a comparison sentinel, not an exception; raising
	        # it would produce a confusing TypeError instead.
	        raise NotImplementedError(
	            '%s must implement _apply()' % type(self).__name__
	        )

	    def apply(self, items):
	        """Lazily yield the non-``None`` results of ``_apply`` for each item."""
	        for item in items:
	            result = self._apply(item)
	            if result is not None:
	                yield result


	class Grep(Command):
	    """Pipeline step that keeps (or, with ``exclude``, drops) matching items."""

	    def __init__(self, pattern, gettext=None, exclude=False):
	        """
	        :param pattern: pattern we are interested in
	        :type pattern: string
	        :param gettext: how to get the greppable text from each input item
	                        By default, it assumes the item per se is greppable.
	        :type gettext: callable taking one item
	        :param exclude: do we want to exclude input that matches the `pattern`?
	        :type exclude: boolean
	        """
	        # The pattern is compiled as given and the match result is inverted
	        # for `exclude`. Wrapping it in '^((?!pattern).)*$' rejected every
	        # text containing a newline ('.' does not match '\n') and shifted
	        # the numbering of any groups inside the pattern.
	        self.regexp = re.compile(pattern)
	        self._exclude = bool(exclude)

	        if gettext:
	            self._get_text = gettext
	        else:
	            self._get_text = lambda item: item

	    def _apply(self, item):
	        """Return `item` when it passes the filter, otherwise ``None``."""
	        text = self._get_text(item)
	        if text is None:
	            # e.g. an XML node without text: nothing to match against.
	            text = ''
	        matched = self.regexp.search(text) is not None
	        if matched != self._exclude:
	            return item
	        return None


	class ExtractFbMessageNodes(Command):
	    """Pipeline step that expands each item into its message nodes."""

	    def apply(self, items):
	        """Yield every descendant ``<div class="message">`` of each item."""
	        for item in items:
	            messages = item.findall('.//div[@class="message"]')
	            for message in messages:
	                yield message

	class BuildMessages(Command):
	    """Pipeline step that wraps each incoming node in a Message object."""

	    def _apply(self, item):
	        """Build the Message for one message node."""
	        return Message(item)


	class Message(object):
	    """A single message: its author, timestamp and text."""

	    def __init__(self, fb_message_div):
	        """
	        :param fb_message_div: the ``<div class="message">`` node of a message
	        :type fb_message_div: node exposing ``find`` and ``getnext``
	        """
	        self.user = self._get_user(fb_message_div)
	        self.timestamp = self._get_timestamp(fb_message_div)
	        self.message = self._get_message(fb_message_div)

	    def _get_user(self, fb_message_div):
	        """Return the text of the ``<span class="user">`` descendant."""
	        return fb_message_div.find('.//span[@class="user"]').text

	    def _get_timestamp(self, fb_message_div):
	        """Parse the text of the ``<span class="meta">`` descendant as a date."""
	        str_date = fb_message_div.find('.//span[@class="meta"]').text
	        return date_parser.parse(str_date)

	    def _get_message(self, fb_message_div):
	        """Return the text of the node that follows the message div."""
	        # The body is read from the next sibling, not from inside the div.
	        return fb_message_div.getnext().text

	    def __unicode__(self):
	        return u'{timestamp} :: {user}'.format(
	            timestamp=self.timestamp,
	            user=self.user
	        )

	    def __str__(self):
	        """Return the ``timestamp :: user`` summary as a native string."""
	        text = self.__unicode__()
	        if str is bytes:
	            # Python 2: str() must return bytes.
	            return text.encode('utf-8')
	        # Python 3: there is no `unicode` builtin and str() must return text.
	        return text

	    def to_tuple(self):
	        """Return ``(timestamp, user, message)``."""
	        return (self.timestamp, self.user, self.message)


	if __name__ == '__main__':
	    # Script entry point: read messages.htm, keep the threads that pass the
	    # name filters below, and dump their messages to messages.json.
	    print('Reading facebook messages.htm file..')
	    # Read raw bytes and let the XML parser handle decoding: text mode
	    # depends on the locale encoding, and lxml refuses unicode strings that
	    # carry an encoding declaration.
	    with open("messages.htm", "rb") as f:
	        contents = f.read()

	    print('Filtering thread nodes..')
	    # recover=True lets the parser continue past markup errors.
	    parser = etree.XMLParser(recover=True)
	    root = etree.fromstring(contents, parser=parser)
	    threads = root.findall('.//div[@class="thread"]')

	    print('Extracting psoling messages..')
	    # Grep on a node's own text rather than on the node object.
	    XMLNodeGrep = functools.partial(Grep, gettext=lambda item: item.text)

	    processor = Processor(threads)
	    # The filters are chained, so a thread must pass every one of them:
	    # contain all the names in the first group and none in the second.
	    processor.pipeline([
	        XMLNodeGrep('Loukakos'),
	        XMLNodeGrep('Tzortzis'),
	        XMLNodeGrep('Stathis'),
	        XMLNodeGrep('Stavropoulos'),
	        XMLNodeGrep('Deltouzos'),
	        XMLNodeGrep('Georgiadou', exclude=True),
	        XMLNodeGrep('Poppins', exclude=True),
	        XMLNodeGrep('Saradis', exclude=True),
	        XMLNodeGrep('Ampatzidis', exclude=True),
	        ExtractFbMessageNodes(),
	        BuildMessages(),
	    ])
	    messages = list(processor.process())

	    print("Sorting psoling messages by date..")
	    messages.sort(key=lambda m: m.timestamp)

	    print("Creating a dataframe..")
	    dataset = pd.DataFrame(
	        [m.to_tuple() for m in messages],
	        columns=['timestamp', 'user', 'message']
	    )

	    print("Saving to json..")
	    dataset.to_json('messages.json')