#!/bin/bash
# Copyright (c) 2012 Yu-Jie Lin
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Gist: https://gist.github.com/2007998
# Blog: http://blog.yjl.im/2012/03/checking-link-ins-with-google-webmaster.html

# Optional local configuration: when a file named wtal.re.sh exists in the
# current directory it is sourced first, so it can pre-populate REGEX_EXCLUDES.
# The explicit "./" matters: without a slash, `source` searches $PATH before
# the current directory and could pick up an unrelated file of the same name.
if [[ -f "wtal.re.sh" ]]; then
  source "./wtal.re.sh"
fi

# Extended regular expressions (grep -E syntax). Any link line matching at
# least one of them is dropped before links are compared.
REGEX_EXCLUDES=(
  # Keep whatever was defined before this point (e.g. by wtal.re.sh).
  "${REGEX_EXCLUDES[@]}"

  # A line consisting solely of "Links" (presumably a header row - confirm
  # against the CSV variants being fed in).
  '^Links$'

  # Launchpad bug/source pages and individual gists.
  'https://bugs\.launchpad\.net/[^/]+/\+(bug|source)/.+'
  'https://gist\.github\.com/[^/]+/.+'

  # Blogspot archive pages, anything on technorati.com (with or without a
  # subdomain), and wordpress.com year / month / day index pages.
  'http://.*\.blogspot\.com/...._.._.._archive\.html'
  'http://([^.]*\.)?technorati\.com/'
  'https?://.*\.wordpress\.com/..../(../(../)?)?$'

  # Listing-style path segments.
  '/(archive|author|category|directory|feeds?|page|tag(ged|s?)?)/'

  # Listing / feed / print query parameters. The parameter must follow "?" or
  # "&"; a bare leading "?" is undefined in POSIX ERE, so it is spelled as a
  # bracket expression instead.
  '[?&](page|tag)='
  '[?&](format|output|type)=(atom|rss)'
  '&view=print'
)

#######################################
# Process one exported link CSV and record links not seen before.
#
# The file name is expected to look like
#   <name>_<timestamp>_ExternalLinks_AllLinks.csv
# The first CSV line is discarded and a trailing ",YYYY-MM-DD"-shaped field is
# stripped from every remaining line, leaving one link per line.
#
# Globals:
#   REGEX_EXCLUDES (read) - grep -E patterns; matching links are ignored
# Arguments:
#   $1 - path to the CSV file
# Outputs:
#   Progress on stdout; errors on stderr.
#   <name>.links     - appended with "<timestamp> <link>" for each new link
#   <name>.checked   - appended with <timestamp> once the CSV is processed
#   <csv>.links      - sorted, de-duplicated, filtered links of this CSV
# Returns:
#   0 on success (including "already checked"), non-zero if the CSV cannot be
#   read or the state files cannot be created.
#######################################
process_csv() {
  local file_csv=$1
  local file_base file_main file_ts
  local file_links file_checked file_csv_links
  local csv re link new_links count

  file_base="${file_csv%_ExternalLinks_AllLinks.csv}"
  # only the domain name
  file_main="${file_base%_*}"
  file_ts="${file_base##*_}"

  file_links="${file_main}.links"
  file_checked="${file_main}.checked"
  file_csv_links="${file_csv}.links"

  # Bail out before touching any state file, so an unreadable CSV is never
  # recorded as checked.
  if [[ ! -r "$file_csv" ]]; then
    printf '%s: cannot read file\n' "$file_csv" >&2
    return 1
  fi

  touch -- "$file_links" "$file_checked" || return

  printf '%s... ' "$file_csv"
  # Don't process this CSV, it has been processed before. Match the timestamp
  # as a whole fixed-string line so one timestamp can't hit inside another.
  if grep -Fxq -- "$file_ts" "$file_checked"; then
    echo already checked
    return 0
  fi

  # Work with both Sample Links and Latest Links
  csv="$(sed '1d;s/,....-..-..$//' <"$file_csv")" || return
  for re in "${REGEX_EXCLUDES[@]}"; do
    csv="$(printf '%s\n' "$csv" | grep -Ev -- "$re")"
  done

  # Blank lines are dropped so a fully filtered CSV doesn't yield an empty
  # "link"; -u keeps a URL listed several times from being re-reported as new
  # on every run.
  printf '%s\n' "$csv" | sed '/^$/d' | sort -u >"$file_csv_links"

  # Lines present only in this CSV's link list (column 3 of comm is suppressed
  # along with column 1, leaving "only in second input").
  new_links="$(cut -d ' ' -f 2 -- "$file_links" | sort -u \
    | comm -13 - "$file_csv_links")"

  if [[ -z "$new_links" ]]; then
    echo no new links
  else
    count=0
    while IFS= read -r link; do
      printf '%s %s\n' "$file_ts" "$link"
      count=$((count + 1))
    done <<<"$new_links" >>"$file_links"

    echo "$count new links"
    echo
    # Hint for listing the links that were just added.
    echo " grep $file_ts \"$file_links\" | cut -d ' ' -f 2"
    echo
  fi

  printf '%s\n' "$file_ts" >>"$file_checked"
}

# Entry point: every command-line argument is a CSV file to process. A failure
# on one file is reported and the remaining files are still processed.
for FILE_CSV in "$@"; do
  process_csv "$FILE_CSV"
done
# Blog post about how to use this script: http://blog.yjl.im/2012/03/checking-link-ins-with-google-webmaster.html