Skip to content

Instantly share code, notes, and snippets.

@tvwerkhoven
Last active November 8, 2017 01:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tvwerkhoven/6956681fff9100875bb9c08b83cdf55e to your computer and use it in GitHub Desktop.
Save tvwerkhoven/6956681fff9100875bb9c08b83cdf55e to your computer and use it in GitHub Desktop.
Whatsapp chat log parser for pisg
# Whatsapp log parser for pisg, made by Tim van Werkhoven
# Features: parses logs, detects subject changes
# Known issues: only works without images; 'image omitted' texts are not filtered out but replaced by '<picture>' so they can be tallied
# Use Charset = "utf-8" in config file to enable emoji
package Pisg::Parser::Format::whatsapp;
use strict;
$^W = 1;
# Constructor for the WhatsApp format parser.
#
# Arguments: the class name followed by a key/value list; only the 'cfg'
# entry is read and stored on the object.
# Returns: a blessed hash holding 'cfg' plus the three line patterns
# ('normalline', 'actionline', 'thirdline') that the parsing methods
# interpolate into their regular expressions.
sub new
{
    my ($type, %args) = @_;

    my $self = {
        cfg => $args{cfg},

        # Tested at http://www.regexplanet.com/advanced/perl/index.html
        # Example lines (both timestamp styles are accepted):
        #   27/11/13 21:40:56: Tim: Zo, even whatsapp geleegd
        #   27/11/2013, 21:40:55: Timmeh: Zo, even whatsapp geleegd
        #
        # The comma after the date is optional (',?') in BOTH patterns, so
        # that normal lines and 'third' lines of the same log agree on the
        # timestamp style. Previously normalline required the comma while
        # thirdline rejected it.
        #
        # normalline captures: hour, nick, text.
        normalline => '^\d+\/\d+\/\d+,?\s(\d+):\d+\:\d+\:\s+([^:]+)\: (.+)',

        # This pattern has no capture groups; it is a placeholder only.
        actionline => '^NA',

        # thirdline captures: hour, minute, remainder of the line.
        thirdline  => '^\d+\/\d+\/\d+,?\s(\d+):(\d+)\:\d+\:\s+(.+)',
    };

    #$self->{cfg}->{botnicks} .= ' Hub-Security';

    bless($self, $type);
    return $self;
}
# Parse a normal (chat message) line.
#
# Returns a hash reference with the keys 'hour', 'nick' and 'saying' when
# $line matches the object's 'normalline' pattern, and nothing when the line
# does not match or when the speaker is listed in the configured botnicks.
sub normalline
{
    my ($self, $line, $lines) = @_;

    # Not a chat message at all: bail out early.
    return unless $line =~ /$self->{normalline}/o;

    # Copy the captures right away; the matches below would clobber them.
    my ($hour, $nick, $text) = ($1, $2, $3);

    # Fix <image omitted>: replace it with <picture>, a single word that
    # can be logged and tallied.
    $text = "<picture>" if $text =~ /.*image omitted.*/;

    # Lines spoken by a configured bot nick are dropped.
    return if $self->{cfg}->{botnicks} =~ /\b\Q$nick\E\b/;

    return {
        hour   => $hour,
        nick   => $nick,
        saying => $text,
    };
}
# Parse an action line.
#
# Returns a hash reference with the keys 'hour', 'nick' and 'saying' when
# $line matches the object's 'actionline' pattern AND that pattern captured
# all three fields; returns nothing otherwise.
sub actionline
{
    my ($self, $line, $lines) = @_;

    return unless $line =~ /$self->{actionline}/o;

    my ($hour, $nick, $saying) = ($1, $2, $3);

    # The pattern set up by new() ('^NA') has no capture groups, so a line
    # that merely starts with "NA" matches while $1..$3 stay undefined.
    # Reporting such a line as an action would hand the caller a hash full
    # of undef values, so treat it as "not an action line" instead.
    return unless defined($hour) && defined($nick) && defined($saying);

    return {
        hour   => $hour,
        nick   => $nick,
        saying => $saying,
    };
}
# Parses the 'third' line - (the third line is everything else, like
# topic changes, mode changes, kicks, etc.)
# thirdline() has to return a hash with the following keys, for
# every format:
# hour - the hour we're in (for timestamp logging)
# min - the minute we're in (for timestamp logging)
# nick - the nick
# kicker - the nick which kicked somebody (if any)
# newtopic - the new topic (if any)
# newmode - deops or ops, must be '+o' or '-o', or '+ooo'
# newjoin - a new nick which has joined the channel
# newnick - a person has changed nick and this is the new nick
#
# It should return a hash with the following (for formatting lines in html)
#
# kicktext - the kick reason (if any)
# modechanges - data of the mode change ('Nick' in '+o Nick')
#
# The hash may also have a "repeated" key indicating the number of times
# the line was repeated. (Used by eggdrops log for example.)
sub thirdline
{
    my ($self, $line, $lines) = @_;

    # Lines that do not carry the expected timestamp prefix are ignored.
    return unless $line =~ /$self->{thirdline}/o;

    # Save the captures before the next match overwrites them.
    my ($hour, $min, $rest) = ($1, $2, $3);

    # By default the whole remainder of the line ends up in 'nick'.
    my %event = (
        hour => $hour,
        min  => $min,
        nick => $rest,
    );

    # Format-specific stuff goes here.
    # Example line: Tim changed the subject to “🎄🎇#!test🎇🎄”
    # A subject change is the only event recognised: the text before the
    # phrase becomes the nick, the text after it the new topic.
    if ($rest =~ /^(.*?) changed the subject to (.+)/) {
        @event{qw(nick newtopic)} = ($1, $2);
    }

    return \%event;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment