Last active
September 29, 2020 17:40
-
-
Save mackuba/0dc2bc4db950f645dbbe29d8496c509c to your computer and use it in GitHub Desktop.
Scripts for processing Nginx logs for Piwik
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'optparse' | |
require 'set' | |
# Global state shared by the option parser, the input loop and dump_buffer.

# When true, dump_buffer prints the lines it would otherwise drop.
$invert = false

# Day of the entries currently held in $buffer; nil until the first line is read.
$current_day = nil

# IPs seen fetching a '.js' URL with status 200 during the current day.
$validated = Set[]

# [ip, line] pairs collected for the current day, in input order.
$buffer = []
# Writes buffered log lines to standard output, filtered by IP.
#
# Each entry of $buffer is an [ip, line] pair. By default a line is printed
# only when its IP is in $validated; when $invert is set, only lines whose IP
# is NOT in $validated are printed. The buffer itself is left untouched.
def dump_buffer
  $buffer.each do |address, entry|
    known = $validated.include?(address)
    wanted = $invert ? !known : known
    STDOUT.print(entry) if wanted
  end
end
# Parses command-line flags and updates the global settings accordingly.
#
# Recognised flags:
#   -v, --invert   print only the lines that would normally be dropped
#
# @param argv [Array<String>] arguments to parse; recognised flags are removed
#   from it in place (OptionParser#parse! semantics). Defaults to ARGV.
# @return [Array<String>] the remaining, unparsed arguments
# @raise [OptionParser::ParseError] on an unknown or malformed option
def parse_options(argv = ARGV)
  OptionParser.new do |opts|
    opts.on("-v", "--invert", "Print only bot logs instead of excluding them") do
      # Assign the global: a bare `invert = true` would only create a
      # block-local variable and leave $invert (read by dump_buffer) unchanged.
      $invert = true
    end
  end.parse!(argv)
end

parse_options
# Reads access-log lines from standard input, groups them by day and flushes
# each day's lines through dump_buffer once the day changes.
#
# A line is considered only if it matches the pattern below (IP, bracketed
# timestamp, quoted "METHOD URL HTTP/x" request and a three-digit status);
# anything else is silently dropped. An IP becomes "validated" for the day as
# soon as one of its requests has a URL containing '.js' and status 200.
STDIN.each_line do |line|
  parsed = /^([\d\.]+) .* \[([^\]]+)\] "([A-Z]+) (\S+) HTTP\/[\d\.]+" (\d\d\d)/.match(line)
  next unless parsed

  client, stamp, _verb, path, status = parsed.captures

  # The day is whatever precedes the first ':' in the timestamp
  # (presumably "dd/Mon/yyyy" in Nginx's default format; verify against the logs).
  log_day = stamp.split(':').first

  unless log_day == $current_day
    # A new day started: emit the previous day's lines and reset per-day state.
    dump_buffer
    $buffer.clear
    $validated.clear
    $current_day = log_day
  end

  $validated.add(client) if status == '200' && path.include?('.js')
  $buffer.push([client, line])
end

# Flush whatever is still buffered for the last day in the input.
dump_buffer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# --- Fixed installation paths -------------------------------------------------
TIMESTAMP_FOLDER="/var/lib/matomo/data/timestamps"
IMPORT_SCRIPT="/usr/share/matomo/misc/log-analytics/import_logs2.py"
CONSOLE_SCRIPT="/usr/share/matomo/console"

# --- Defaults, overridable from the command line ------------------------------
PAGE_TITLES_FILE=""   # -p
PIWIK_URL="https://example.com/piwik"
SITE_ID=0             # -s (0 means "not given")
PAYLOAD_SIZE=50       # -b
ARCHIVE=0             # -a
VERBOSE=0             # -v

# Extra arguments handed to the import script; starts with one
# --useragent-exclude entry per user-agent fragment listed here.
OPTIONS=()
for AGENT in "+http" "@" "qt/"; do
  OPTIONS+=("--useragent-exclude=$AGENT")
done

# Log line format passed as --log-format-regex, assembled in three pieces:
# client/user, then date/request/status, then length/referrer/agent/time.
FORMAT_REGEX='(?P<ip>[\w*.:-]+)\s+\S+\s+(?P<userid>\S+)\s+'
FORMAT_REGEX+='\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+"(?P<method>\S+)\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\d+)\s+'
FORMAT_REGEX+='(?P<length>\S+)\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"\s*(?P<generation_time_secs>[.\d]*)'
# Prints the command-line synopsis on standard output and terminates the
# script with exit status 1.
print_usage() {
  printf '%s\n' "Usage: $0 -s <siteid> -p <page_titles|-> [-t <timestamp-name>] [-b <block_size>] [-a] [-v]"
  exit 1
}
# Aborts through print_usage when the value expected after a flag is empty
# or missing.
require_value() {
  [ -n "$1" ] || print_usage
}

# Consume the command line one flag at a time.
#   -t NAME   adds --timestamp-file=$TIMESTAMP_FOLDER/NAME.time to OPTIONS
#   -p FILE   sets PAGE_TITLES_FILE
#   -s ID     sets SITE_ID
#   -b SIZE   sets PAYLOAD_SIZE
#   -a        sets ARCHIVE=1
#   -v        sets VERBOSE=1
# Anything else prints the usage text and exits.
while (( $# > 0 )); do
  case "$1" in
    -t)
      require_value "$2"
      OPTIONS+=("--timestamp-file=$TIMESTAMP_FOLDER/$2.time")
      shift 2
      ;;
    -p)
      require_value "$2"
      PAGE_TITLES_FILE="$2"
      shift 2
      ;;
    -s)
      require_value "$2"
      SITE_ID="$2"
      shift 2
      ;;
    -b)
      require_value "$2"
      PAYLOAD_SIZE="$2"
      shift 2
      ;;
    -a)
      ARCHIVE=1
      shift
      ;;
    -v)
      VERBOSE=1
      shift
      ;;
    *)
      print_usage
      ;;
  esac
done
# Validate the parsed arguments before doing any work.

# SITE_ID is later interpolated into a file path and passed to the import
# script, so reject anything that is not a plain decimal number. Without this
# check a non-numeric value only made the numeric test below error out and the
# script carried on with it.
if ! [[ "$SITE_ID" =~ ^[0-9]+$ ]]; then
  echo "Error: -s parameter must be a numeric site id"
  exit 1
fi

# 0 is the default, i.e. -s was not given.
if [ "$SITE_ID" -eq 0 ]; then
  echo "Error: missing -s parameter"
  exit 1
fi

# -p is mandatory; the literal value '-' means "no page titles file".
if [ -z "$PAGE_TITLES_FILE" ]; then
  echo "Error: missing -p parameter, pass '-' to skip"
  exit 1
elif [ "$PAGE_TITLES_FILE" != "-" ]; then
  OPTIONS+=("--page-titles-from=$PAGE_TITLES_FILE")
fi

# -v: trace every command from here on.
if [ "$VERBOSE" -gt 0 ]; then
  set -x
fi
# Filter the log arriving on standard input, import the result, and
# optionally trigger archiving.

CLEANED_FILE="/tmp/piwik-import-$SITE_ID.log"

# log-exclude-bots reads this script's standard input directly; no `cat` needed.
log-exclude-bots > "$CLEANED_FILE"

# Every expansion is quoted: FORMAT_REGEX in particular contains `*`, `?` and
# `[`, which an unquoted expansion would subject to pathname expansion (and
# word splitting) before the import script ever sees it.
python "$IMPORT_SCRIPT" --idsite="$SITE_ID" \
  --url="$PIWIK_URL" \
  --recorder-max-payload-size="$PAYLOAD_SIZE" \
  --log-format-regex="$FORMAT_REGEX" \
  --regex-group-to-page-cvar="user_agent=UserAgent" \
  "${OPTIONS[@]}" \
  "$CLEANED_FILE"

# -a: run the archiver for this site right after the import.
if [ "$ARCHIVE" -gt 0 ]; then
  "$CONSOLE_SCRIPT" core:archive --force-idsites="$SITE_ID"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Provisions a Matomo/Piwik host: PHP packages, the Matomo apt package, a
# database and database user, geoip databases, the custom log-processing
# scripts, and the cron jobs that run them.
- hosts: all
  become: true

  vars:
    php_version: "7.2"

  tasks:
    # ---- PHP ---------------------------------------------------------------
    - name: "Add the PHP repository to apt"
      apt_repository:
        repo: ppa:ondrej/php

    - name: "Install PHP apt packages"
      apt:
        name:
          - php{{ php_version }}-cli
          - php{{ php_version }}-curl
          - php{{ php_version }}-fpm
          - php{{ php_version }}-gd
          - php{{ php_version }}-mbstring
          - php{{ php_version }}-mysql # TODO: php 7.3 does not provide php-mysql?
          - php{{ php_version }}-xml
          - php-mysql
          - php-geoip
        state: latest
        update_cache: yes
        cache_valid_time: 3600
        install_recommends: no

    - name: "Disable allow_url_fopen"
      lineinfile:
        dest: "/etc/php/{{ php_version }}/fpm/php.ini"
        regexp: "allow_url_fopen ="
        line: "allow_url_fopen = Off"

    # ---- Matomo package ----------------------------------------------------
    # The signing key is fetched over HTTPS, like the repository itself, so
    # the download cannot be tampered with in transit.
    - name: "Load the Matomo apt repository key"
      apt_key:
        id: 1FD752571FE36FF23F78F91B81E2E78B66FED89E
        url: https://debian.matomo.org/repository.gpg

    - name: "Add the Matomo repository to apt"
      apt_repository:
        repo: "deb https://debian.matomo.org piwik main"

    - name: "Install Matomo apt package"
      apt:
        name: matomo
        state: latest
        update_cache: yes
        cache_valid_time: 3600
        install_recommends: no

    - name: "Enable automatic upgrades in debconf"
      debconf:
        name: matomo
        question: matomo/automatic-upgrade
        value: true
        vtype: boolean

    # ---- Database ----------------------------------------------------------
    - name: "Create a database"
      mysql_db:
        name: piwik

    # The password is generated (or read back) by the `password` lookup from a
    # per-host file under passwords/ on the control machine.
    - name: "Create a database user"
      mysql_user:
        user: piwik
        host: localhost
        priv: "piwik.*:SELECT,INSERT,UPDATE,DELETE,CREATE,DROP,ALTER,CREATE\ TEMPORARY\ TABLES,LOCK\ TABLES"
        password: "{{ lookup('password', 'passwords/' + ansible_hostname + '/piwik-db-user length=15') }}"

    # don't reload pointlessly every 5 seconds
    - name: "Change live refresh setting"
      lineinfile:
        dest: "/etc/matomo/config.ini.php"
        regexp: "^live_widget_refresh_after_seconds"
        line: "live_widget_refresh_after_seconds = 60"
        insertafter: "^\\[[Gg]eneral\\]"

    # ---- GeoIP -------------------------------------------------------------
    - name: "Add the MaxMind repository for geoip tools"
      apt_repository:
        repo: ppa:maxmind/ppa

    - name: "Install geoip packages"
      apt:
        name: geoipupdate
        state: latest
        update_cache: yes
        cache_valid_time: 3600
        install_recommends: no

    - name: "Configure geoip to also use the ASN database"
      lineinfile:
        dest: /etc/GeoIP.conf
        regexp: "^EditionIDs"
        line: "EditionIDs GeoLite2-Country GeoLite2-City GeoLite2-ASN"

    # `creates` makes this a one-time download; later refreshes are done by
    # the cron job defined further down.
    - name: "Update geoip database"
      command: geoipupdate -d /usr/share/matomo/misc/
      args:
        creates: /usr/share/matomo/misc/GeoLite2-City.mmdb

    # ---- Log import tooling ------------------------------------------------
    - name: "Create a folder for log timestamps"
      file:
        path: "/var/lib/matomo/data/timestamps"
        state: directory
        owner: www-data
        group: www-data

    - name: "Upload custom scripts used for processing logs"
      copy:
        src: "files/{{ item }}"
        dest: /usr/local/bin
        mode: 0755
      with_items:
        - log-exclude-bots
        - log-import

    # from https://github.com/mackuba/matomo-log-analytics
    - name: "Upload modified import_logs.py"
      copy:
        src: "files/import_logs.py"
        dest: /usr/share/matomo/misc/log-analytics/import_logs2.py
        mode: 0755

    # ---- Cron jobs ---------------------------------------------------------
    # Runs at 03:30 on the 10th of each month.
    - name: "Set up a cron job for updating geoip"
      cron:
        user: root
        name: "Update geoip database"
        day: 10
        hour: 3
        minute: 30
        job: "geoipupdate -d /usr/share/matomo/misc/"

    # Runs at minute 1 of every hour; the import itself runs as www-data while
    # the output redirection is performed by root's cron shell.
    - name: "Set up a cron job to run log analytics"
      cron:
        user: root
        name: "Run Piwik log analytics"
        minute: 1
        job: "sudo -u www-data sh -c \"cat /var/log/nginx/blog-access.log | log-import -s 5 -p /var/www/blog/shared/page-titles.txt -t blog -a\" >> /var/log/matomo/log-import.log"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment