Skip to content

Instantly share code, notes, and snippets.

Created April 25, 2013 21:03
Show Gist options
  • Save bwbaugh/5463151 to your computer and use it in GitHub Desktop.
Save bwbaugh/5463151 to your computer and use it in GitHub Desktop.
Detecting a Specific Watermark in a Photo with Python Get example training and testing images here: <> Stack Overflow question: <>
# Copyright (C) 2013 Wesley Baugh
"""Tools for text classification.

Extracted from the infer library.
"""
from __future__ import division
import math
from collections import defaultdict, namedtuple, Counter
from fractions import Fraction
class MultinomialNB(object):
    """Multinomial Naive Bayes for text classification.

    Attributes:
        exact: Boolean indicating if exact probabilities should be
            returned as a `Fraction`. Otherwise, speed up computations
            but only return probabilities as a `float`. (default False)
        laplace: Smoothing parameter >= 0. (default 1)
        top_features: Number indicating the top-k most common features
            to use during classification, sorted by the frequency the
            feature has been seen (a count is kept for each label). This
            is a form of feature selection because any feature that has
            a frequency less than any of the top-k most common features
            is ignored during classification. This value must be set
            before any training of the classifier. (default None)
            NOTE(review): this option relies on a `MostCommon` helper
            that must be defined at module level; confirm it exists
            before enabling `top_features`.

    Properties:
        labels: Set of all class labels.
        vocabulary: Set of vocabulary across all class labels.
    """

    Prediction = namedtuple('Prediction', 'label confidence')

    # `basestring` only exists on Python 2; fall back to `str` so the
    # "document must not be a plain string" check works on Python 3 too.
    try:
        _string_types = (basestring,)  # noqa: F821
    except NameError:
        _string_types = (str,)

    def __init__(self, *documents):
        """Create a new Multinomial Naive Bayes classifier.

        Args:
            documents: Optional list of document-label pairs for training.
        """
        self.exact = False
        self.laplace = 1
        self.top_features = None
        # Dictionary of sets of vocabulary by label.
        self._label_vocab = defaultdict(set)
        # Dictionary of times a label has been seen.
        self._label_count = Counter()
        # Dictionary of number of feature seen in all documents by label.
        self._label_length = Counter()
        # Dictionary of times a feature has been seen by label.
        self._label_feature_count = defaultdict(Counter)
        # Size of vocabulary across all class labels.
        self._vocab_size = 0
        if documents:
            self.train(*documents)

    @property
    def labels(self):
        """Set of all class labels.

        Returns:
            Example: set(['positive', 'negative'])
        """
        return set(self._label_count)

    @property
    def vocabulary(self):
        """Set of vocabulary (features) seen in any class label."""
        return set().union(*self._label_vocab.values())

    def train(self, *documents):
        """Train the classifier on a document-label pair(s).

        Args:
            documents: Tuple of (document, label) pair(s). Documents
                must be a collection of features. The label can be any
                hashable object, though is usually a string.

        Raises:
            TypeError: If a document is a plain string rather than a
                collection of features.
        """
        for document, label in documents:
            if isinstance(document, self._string_types):
                raise TypeError('Documents must be a collection of features')
            self._label_count[label] += 1
            for feature in document:
                # Grow the global vocabulary size only the first time a
                # feature is seen under any label.
                seen = any(feature in vocab
                           for vocab in self._label_vocab.values())
                if not seen:
                    self._vocab_size += 1
                # Record the feature for this label; without this the
                # vocabulary stays empty and `_vocab_size` would count
                # every occurrence instead of every distinct feature.
                self._label_vocab[label].add(feature)
                self._label_feature_count[label][feature] += 1
                self._label_length[label] += 1
                if self.top_features:
                    if not hasattr(self, '_most_common'):
                        # Created lazily so `top_features` is read when
                        # the per-label tracker is first needed.
                        x = lambda: MostCommon(self.top_features)
                        self._most_common = defaultdict(x)
                    y = self._label_feature_count[label][feature]
                    self._most_common[label][feature] = y

    def prior(self, label):
        """Prior probability of a label.

        Args:
            label: The target class label.

        Returns:
            The number of training instances that had the target
            `label`, divided by the total number of training instances.

        Raises:
            KeyError: If `label` has never been seen in training.
        """
        if label not in self._label_count:
            raise KeyError(label)
        total = sum(self._label_count.values())
        if self.exact:
            return Fraction(self._label_count[label], total)
        else:
            return self._label_count[label] / total

    def conditional(self, feature, label):
        """Conditional probability for a feature given a label.

        Args:
            feature: The target feature.
            label: The target class label.

        Returns:
            The number of times the feature has been present across all
            training documents for the `label`, divided by the sum of
            the length of every training document for the `label`.
            Both terms are adjusted by Laplace smoothing.

        Raises:
            KeyError: If `label` has never been seen in training.
        """
        if label not in self._label_count:
            raise KeyError(label)
        # Times feature seen across all documents in a label.
        numer = self.laplace
        # Avoid creating an entry if the term has never been seen
        if feature in self._label_feature_count[label]:
            numer += self._label_feature_count[label][feature]
        denom = self._label_length[label] + (self._vocab_size * self.laplace)
        if self.exact:
            return Fraction(numer, denom)
        else:
            return numer / denom

    def _score(self, document, label):
        """Multinomial raw score of a document given a label.

        Args:
            document: Collection of features.
            label: The target class label.

        Returns:
            The multinomial raw score of the `document` given the
            `label`. In exact mode this is a product of probabilities;
            otherwise it is a sum of log-probabilities. In order to
            turn the raw score into a confidence value, this value
            should be divided by the sum of the raw scores across all
            class labels.

        Raises:
            TypeError: If the document is a plain string.
        """
        if isinstance(document, self._string_types):
            raise TypeError('Documents must be a list of features')
        if self.exact:
            score = self.prior(label)
        else:
            score = math.log(self.prior(label))
        for feature in document:
            # Feature selection by only considering the top-k
            # most common features (a form of dictionary trimming).
            if self.top_features and feature not in self._most_common[label]:
                continue
            conditional = self.conditional(feature, label)
            if self.exact:
                score *= conditional
            else:
                score += math.log(conditional)
        return score

    def _compute_scores(self, document):
        """Compute the multinomial score of a document for all labels.

        Args:
            document: Collection of features.

        Returns:
            A dict mapping class labels to the multinomial raw score
            for the `document` given the label.
        """
        return {x: self._score(document, x) for x in self.labels}

    def prob_all(self, document):
        """Probability of a document for all labels.

        Args:
            document: Collection of features.

        Returns:
            A dict mapping class labels to the confidence value that the
            `document` belongs to the label.

        Raises:
            ValueError: If the classifier has not been trained yet.
        """
        score = self._compute_scores(document)
        if not score:
            raise ValueError('Cannot compute probabilities before training')
        normalize = None
        if not self.exact:
            # If the log-likelihood is too small, when we convert back
            # using `math.exp`, the result will round to zero. Shifting
            # by the largest score keeps at least one term equal to 1.
            normalize = max(score.values())
            assert normalize <= 0, normalize
            score = {x: math.exp(score[x] - normalize) for x in score}
        total = sum(score.values())
        assert total > 0, (total, score, normalize)
        # In exact mode both operands are `Fraction`, so the quotient
        # stays exact; otherwise this is ordinary float division.
        return {label: score[label] / total for label in score}

    def prob(self, document, label):
        """Probability of a document given a label.

        Args:
            document: Collection of features.
            label: The target class label.

        Returns:
            The confidence value that the `document` belongs to `label`.
        """
        return self.prob_all(document)[label]

    def classify(self, document):
        """Get the most confident class label for a document.

        Args:
            document: Collection of features.

        Returns:
            A namedtuple representing the most confident class `label`
            and the value of the `confidence` in the label. For example:

            As tuple:
                ('positive', 0.85)
            As namedtuple:
                Prediction(label='positive', confidence=0.85)
        """
        prob = self.prob_all(document)
        label = max(prob, key=prob.get)
        return self.Prediction(label, prob[label])
# Copyright (C) 2013 Wesley Baugh
"""Detect watermark in images.

### Requires

- Pillow
"""
import glob
from classify import MultinomialNB
from PIL import Image
# Glob patterns for the labelled image folders.
TRAINING_POSITIVE = 'training-positive/*.jpg'
TRAINING_NEGATIVE = 'training-negative/*.jpg'
TEST_POSITIVE = 'test-positive/*.jpg'
TEST_NEGATIVE = 'test-negative/*.jpg'

# How many pixels to grab from the top-right of image.
# NOTE(review): the crop-size definition is missing from this copy of the
# script; 100x100 is a placeholder -- tune it so the crop covers the
# watermark in your own images.
CROP_WIDTH, CROP_HEIGHT = 100, 100
# Size the cropped region is scaled down to before feature extraction.
RESIZED = (16, 16)
def get_image_data(infile):
    """Turn the top-right corner of an image into a bag of features.

    The top-right `CROP_WIDTH` x `CROP_HEIGHT` region is cropped, scaled
    to `RESIZED`, and reduced to one averaged brightness value per pixel.
    Each pixel position is then emitted as a feature repeated as many
    times as its brightness, so brighter positions weigh more.

    Args:
        infile: Path (or file object) of the image to read.

    Returns:
        A list of integer pixel positions, with each position repeated
        once per unit of averaged brightness at that position.
    """
    # Force three channels so every pixel is an (R, G, B) tuple; other
    # modes (e.g. grayscale) would yield bare ints that `sum` rejects.
    image = Image.open(infile).convert('RGB')
    width = image.size[0]
    #     left                upper  right  lower
    box = width - CROP_WIDTH, 0, width, CROP_HEIGHT
    region = image.crop(box)
    resized = region.resize(RESIZED)
    # Convert RGB to simple averaged value. Floor division keeps the
    # result an int on Python 3, which list repetition below requires.
    data = [sum(pixel) // 3 for pixel in resized.getdata()]
    # Combine location and value.
    values = []
    for location, value in enumerate(data):
        values.extend([location] * value)
    return values
def main():
    """Train on the labelled image folders, then report test accuracy.

    Progress is printed after every training and test image, followed
    by a final summary line with the number of correct predictions.
    """
    watermark = MultinomialNB()

    # Training
    count = 0
    for pattern, label in ((TRAINING_POSITIVE, 'positive'),
                           (TRAINING_NEGATIVE, 'negative')):
        for infile in glob.glob(pattern):
            data = get_image_data(infile)
            watermark.train((data, label))
            count += 1
            # Single-argument print() emits the same text on Python 2 and 3.
            print('Training {0}'.format(count))

    # Testing
    correct, total = 0, 0
    for pattern, label in ((TEST_POSITIVE, 'positive'),
                           (TEST_NEGATIVE, 'negative')):
        for infile in glob.glob(pattern):
            data = get_image_data(infile)
            prediction = watermark.classify(data)
            if prediction.label == label:
                correct += 1
            total += 1
            print('Testing ({0} / {1})'.format(correct, total))
    print('Got {0} out of {1} correct'.format(correct, total))
# Run the train/test cycle only when executed as a script.
if __name__ == '__main__':
    main()
Copy link

FYI, I see you specify to use Pillow 2.0, but if I'm dumb and ignore that and use Pillow 4.0, I only get 9 out of 10 matches using Python 2.7.5. Nothing a bit more training probably wouldn't solve, though! Nice work, btw!

Copy link

Cajet0 commented Jan 3, 2024

2023, used python 3.9 with pillow 4.0. Got 9/10 with the provided images also. Added my personal images to training pos/neg and tests. Got 16/21

Edit: Solved it looking at the code more thoroughly.

You have to set your own CROP_WIDTH and CROP_HEIGHT and also modify the crop coordinates to match your training images. Got 11/11 after that.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment