Skip to content

Instantly share code, notes, and snippets.

@mackuba
Last active September 29, 2020 17:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mackuba/0dc2bc4db950f645dbbe29d8496c509c to your computer and use it in GitHub Desktop.
Save mackuba/0dc2bc4db950f645dbbe29d8496c509c to your computer and use it in GitHub Desktop.
Scripts for processing Nginx logs for Piwik
#!/usr/bin/env ruby
require 'optparse'
require 'set'
# --- Shared state for the filter ---

# When true, output is inverted: lines from non-validated IPs are printed
# instead of lines from validated ones.
$invert = false

# Day portion of the timestamp of the entries currently held in $buffer
# (nil until the first log line has been parsed).
$current_day = nil

# IPs seen successfully fetching a JavaScript file during the current day.
$validated = Set.new

# [ip, raw_line] pairs for the current day, kept in input order.
$buffer = []
# Writes the buffered log lines to standard output, filtered by IP.
#
# In normal mode only lines whose IP is in $validated are printed; when
# $invert is set, only lines whose IP is NOT in $validated are printed.
# The buffer itself is left untouched - callers clear it.
def dump_buffer
  $buffer.each do |ip, line|
    validated = $validated.include?(ip)
    wanted = $invert ? !validated : validated
    STDOUT.print(line) if wanted
  end
end
# Parses the command-line switches, removing recognised options from +argv+
# in place.
#
# @param argv [Array<String>] arguments to parse (defaults to the process ARGV)
# @return [Array<String>] the remaining non-option arguments
# @raise [OptionParser::ParseError] on an unrecognised or malformed option
def parse_options(argv = ARGV)
  OptionParser.new do |opts|
    opts.on("-v", "--invert", "Print only bot logs instead of excluding them") do
      # This has to assign the global. A bare `invert = true` only creates a
      # block-local variable, which leaves $invert false and turns -v into a
      # silent no-op.
      $invert = true
    end
  end.parse!(argv)
end

parse_options
# Streams an access log from standard input, one day at a time.
#
# Each parsed line is buffered together with its client IP. An IP becomes
# "validated" for the day once it receives a 200 response for a URL containing
# ".js" (presumably because bots rarely load JavaScript - the filter's
# rationale is not stated in this file). When the day changes, the buffered
# lines are flushed through dump_buffer and the per-day state is reset.
# Lines that do not match the expected log format are dropped.
STDIN.each_line do |line|
  entry = line.match(/^([\d\.]+) .* \[([^\]]+)\] "([A-Z]+) (\S+) HTTP\/[\d\.]+" (\d\d\d)/)
  next unless entry

  ip, timestamp, _http_method, url, code = entry.captures

  # The timestamp is split on ":" and the first piece is treated as the day.
  day = timestamp.split(':')[0]

  unless day == $current_day
    dump_buffer
    $buffer.clear
    $validated.clear
    $current_day = day
  end

  $validated.add(ip) if code == '200' && url.include?('.js')

  $buffer << [ip, line]
end

# Flush whatever is left from the last day in the input.
dump_buffer
#!/bin/bash
# --- Matomo installation paths ---
IMPORT_SCRIPT="/usr/share/matomo/misc/log-analytics/import_logs2.py"
CONSOLE_SCRIPT="/usr/share/matomo/console"
TIMESTAMP_FOLDER="/var/lib/matomo/data/timestamps"

# --- Tracker endpoint ---
PIWIK_URL="https://example.com/piwik"

# --- Defaults, overridable from the command line ---
SITE_ID=0             # -s; 0 means "not provided"
PAGE_TITLES_FILE=""   # -p; "-" skips page titles
PAYLOAD_SIZE=50       # -b
ARCHIVE=0             # -a
VERBOSE=0             # -v

# Extra arguments handed to the import script. It starts out with the
# user-agent substrings to exclude; further options are appended while
# parsing flags.
OPTIONS=(
  "--useragent-exclude=+http"
  "--useragent-exclude=@"
  "--useragent-exclude=qt/"
)
# Log-line pattern passed to the import script via --log-format-regex.
# It is written as three adjacent single-quoted strings joined by line
# continuations, so the value contains no whitespace or newlines.
# Named groups, in order: ip, userid, date, timezone, method, path, status,
# length, referrer, user_agent, and a trailing generation_time_secs group
# that may match an empty string.
FORMAT_REGEX='(?P<ip>[\w*.:-]+)\s+\S+\s+(?P<userid>\S+)\s+'\
'\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+'\
'(?P<length>\S+)\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"\s*(?P<generation_time_secs>[.\d]*)'
# Prints the command-line synopsis to stdout and terminates the script with
# exit status 1. Never returns to the caller.
print_usage() {
  printf '%s\n' "Usage: $0 -s <siteid> -p <page_titles|-> [-t <timestamp-name>] [-b <block_size>] [-a] [-v]"
  exit 1
}
# Consume the command line one flag at a time.
# Flags that take a value (-t, -p, -s, -b) require a non-empty argument.
# print_usage exits the script, so the statement after each guard runs only
# when a value is present. Any unrecognised argument also ends in print_usage.
while (( $# > 0 )); do
  case "$1" in
    -t)
      [ -n "$2" ] || print_usage
      OPTIONS+=("--timestamp-file=$TIMESTAMP_FOLDER/$2.time")
      shift 2
      ;;
    -p)
      [ -n "$2" ] || print_usage
      PAGE_TITLES_FILE="$2"
      shift 2
      ;;
    -s)
      [ -n "$2" ] || print_usage
      SITE_ID="$2"
      shift 2
      ;;
    -b)
      [ -n "$2" ] || print_usage
      PAYLOAD_SIZE="$2"
      shift 2
      ;;
    -a)
      ARCHIVE=1
      shift
      ;;
    -v)
      VERBOSE=1
      shift
      ;;
    *)
      print_usage
      ;;
  esac
done
# Validate the parsed arguments before doing any work.

# The site id is later interpolated into a /tmp file name and passed as
# --idsite, so reject anything that is not a plain non-negative integer.
# Without this check a non-numeric value only produced a "[: integer
# expression expected" message and the script carried on.
case "$SITE_ID" in
  ''|*[!0-9]*)
    echo "Error: -s must be a numeric site id"
    exit 1
    ;;
esac

# 0 is the default, i.e. -s was never given.
if [ "$SITE_ID" -eq 0 ]; then
  echo "Error: missing -s parameter"
  exit 1
fi

# -p is mandatory; "-" explicitly opts out of page titles.
if [ -z "$PAGE_TITLES_FILE" ]; then
  echo "Error: missing -p parameter, pass '-' to skip"
  exit 1
elif [ "$PAGE_TITLES_FILE" != "-" ]; then
  OPTIONS+=("--page-titles-from=$PAGE_TITLES_FILE")
fi

# -v turns on shell command tracing for the rest of the script.
if [ "$VERBOSE" -gt 0 ]; then
  set -x
fi
# Intermediate file holding the log lines that survive bot filtering.
CLEANED_FILE="/tmp/piwik-import-$SITE_ID.log"

# Filter the log arriving on stdin. log-exclude-bots inherits this script's
# stdin directly, so no `cat |` is needed.
log-exclude-bots > "$CLEANED_FILE"

# Import the filtered log. FORMAT_REGEX must be quoted: it is full of glob
# metacharacters (*, ?, [...]) and an unquoted expansion would be subject to
# pathname expansion before python ever saw it.
python "$IMPORT_SCRIPT" --idsite="$SITE_ID" \
  --url="$PIWIK_URL" \
  --recorder-max-payload-size="$PAYLOAD_SIZE" \
  --log-format-regex="$FORMAT_REGEX" \
  --regex-group-to-page-cvar="user_agent=UserAgent" \
  "${OPTIONS[@]}" \
  "$CLEANED_FILE"

# With -a, trigger archiving for this site after the import.
if [ "$ARCHIVE" -gt 0 ]; then
  "$CONSOLE_SCRIPT" core:archive --force-idsites="$SITE_ID"
fi
---
# Provisions a Matomo (formerly Piwik) host:
#   * installs PHP and the Matomo apt package,
#   * creates the MySQL database and user,
#   * installs and schedules geoipupdate,
#   * uploads the log-processing scripts and schedules the hourly log import.
- hosts: all
  become: true
  vars:
    php_version: "7.2"

  tasks:
    - name: "Add the PHP repository to apt"
      apt_repository:
        repo: ppa:ondrej/php

    - name: "Install PHP apt packages"
      apt:
        name:
          - php{{ php_version }}-cli
          - php{{ php_version }}-curl
          - php{{ php_version }}-fpm
          - php{{ php_version }}-gd
          - php{{ php_version }}-mbstring
          - php{{ php_version }}-mysql  # TODO: php 7.3 does not provide php-mysql?
          - php{{ php_version }}-xml
          - php-mysql
          - php-geoip
        state: latest
        update_cache: yes
        cache_valid_time: 3600
        install_recommends: no

    - name: "Disable allow_url_fopen"
      lineinfile:
        dest: "/etc/php/{{ php_version }}/fpm/php.ini"
        regexp: "allow_url_fopen ="
        line: "allow_url_fopen = Off"

    # Fetch the signing key over HTTPS (the repository itself is already
    # served over HTTPS); the fingerprint in `id` pins the expected key.
    - name: "Load the Matomo apt repository key"
      apt_key:
        id: 1FD752571FE36FF23F78F91B81E2E78B66FED89E
        url: https://debian.matomo.org/repository.gpg

    - name: "Add the Matomo repository to apt"
      apt_repository:
        repo: "deb https://debian.matomo.org piwik main"

    - name: "Install Matomo apt package"
      apt:
        name: matomo
        state: latest
        update_cache: yes
        cache_valid_time: 3600
        install_recommends: no

    # debconf stores booleans as the strings 'true'/'false', so pass the
    # value as a string rather than a YAML boolean.
    - name: "Enable automatic upgrades in debconf"
      debconf:
        name: matomo
        question: matomo/automatic-upgrade
        value: "true"
        vtype: boolean

    - name: "Create a database"
      mysql_db:
        name: piwik

    - name: "Create a database user"
      mysql_user:
        user: piwik
        host: localhost
        priv: "piwik.*:SELECT,INSERT,UPDATE,DELETE,CREATE,DROP,ALTER,CREATE\ TEMPORARY\ TABLES,LOCK\ TABLES"
        password: "{{ lookup('password', 'passwords/' + ansible_hostname + '/piwik-db-user length=15') }}"

    # don't reload pointlessly every 5 seconds
    - name: "Change live refresh setting"
      lineinfile:
        dest: "/etc/matomo/config.ini.php"
        regexp: "^live_widget_refresh_after_seconds"
        line: "live_widget_refresh_after_seconds = 60"
        insertafter: "^\\[[Gg]eneral\\]"

    - name: "Add the MaxMind repository for geoip tools"
      apt_repository:
        repo: ppa:maxmind/ppa

    - name: "Install geoip packages"
      apt:
        name: geoipupdate
        state: latest
        update_cache: yes
        cache_valid_time: 3600
        install_recommends: no

    - name: "Configure geoip to also use the ASN database"
      lineinfile:
        dest: /etc/GeoIP.conf
        regexp: "^EditionIDs"
        line: "EditionIDs GeoLite2-Country GeoLite2-City GeoLite2-ASN"

    # `creates` makes this a one-time initial download; later refreshes are
    # handled by the cron job defined further down.
    - name: "Update geoip database"
      command: geoipupdate -d /usr/share/matomo/misc/
      args:
        creates: /usr/share/matomo/misc/GeoLite2-City.mmdb

    - name: "Create a folder for log timestamps"
      file:
        path: "/var/lib/matomo/data/timestamps"
        state: directory
        owner: www-data
        group: www-data

    - name: "Upload custom scripts used for processing logs"
      copy:
        src: "files/{{ item }}"
        dest: /usr/local/bin
        mode: "0755"
      with_items:
        - log-exclude-bots
        - log-import

    # from https://github.com/mackuba/matomo-log-analytics
    - name: "Upload modified import_logs.py"
      copy:
        src: "files/import_logs.py"
        dest: /usr/share/matomo/misc/log-analytics/import_logs2.py
        mode: "0755"

    # Runs at 03:30 on the 10th of each month.
    - name: "Set up a cron job for updating geoip"
      cron:
        user: root
        name: "Update geoip database"
        day: 10
        hour: 3
        minute: 30
        job: "geoipupdate -d /usr/share/matomo/misc/"

    # Runs at minute 1 of every hour.
    - name: "Set up a cron job to run log analytics"
      cron:
        user: root
        name: "Run Piwik log analytics"
        minute: 1
        job: "sudo -u www-data sh -c \"cat /var/log/nginx/blog-access.log | log-import -s 5 -p /var/www/blog/shared/page-titles.txt -t blog -a\" >> /var/log/matomo/log-import.log"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment