Skip to content

Instantly share code, notes, and snippets.

Created April 25, 2013 21:03
Show Gist options
  • Star 17 You must be signed in to star a gist
  • Fork 7 You must be signed in to fork a gist
  • Save bwbaugh/5463151 to your computer and use it in GitHub Desktop.
Save bwbaugh/5463151 to your computer and use it in GitHub Desktop.
Detecting a Specific Watermark in a Photo with Python Get example training and testing images here: <> Stack Overflow question: <>
# Copyright (C) 2013 Wesley Baugh
"""Tools for text classification.

Extracted from the [infer](https://github.com/bwbaugh/infer) library.
"""
from __future__ import division
import math
from collections import defaultdict, namedtuple, Counter
from fractions import Fraction
class MultinomialNB(object):
    """Multinomial Naive Bayes for text classification.

    Attributes:
        exact: Boolean indicating if exact probabilities should be
            returned as a `Fraction`. Otherwise, speed up computations
            but only return probabilities as a `float`. (default False)
        laplace: Smoothing parameter >= 0. (default 1)
        top_features: Number indicating the top-k most common features
            to use during classification, sorted by the frequency the
            feature has been seen (a count is kept for each label). This
            is a form of feature selection because any feature that has
            a frequency less than any of the top-k most common features
            is ignored during classification. This value must be set
            before any training of the classifier. (default None)
        labels: Set of all class labels.
        vocabulary: Set of vocabulary across all class labels.
    """

    Prediction = namedtuple('Prediction', 'label confidence')

    # Types that are rejected as documents: iterating a string would
    # silently treat every character as a feature.
    try:
        _string_types = basestring  # Python 2: covers `str` and `unicode`.
    except NameError:
        _string_types = (str, bytes)  # Python 3.

    def __init__(self, *documents):
        """Create a new Multinomial Naive Bayes classifier.

        Args:
            documents: Optional list of document-label pairs for training.
        """
        self.exact = False
        self.laplace = 1
        self.top_features = None
        # Dictionary of sets of vocabulary by label.
        self._label_vocab = defaultdict(set)
        # Dictionary of times a label has been seen.
        self._label_count = Counter()
        # Dictionary of number of feature seen in all documents by label.
        self._label_length = Counter()
        # Dictionary of times a feature has been seen by label.
        self._label_feature_count = defaultdict(Counter)
        # Size of vocabulary across all class labels.
        self._vocab_size = 0
        if documents:
            self.train(*documents)

    @property
    def labels(self):
        """Set of all class labels.

        Example: set(['positive', 'negative'])
        """
        return set(self._label_count)

    @property
    def vocabulary(self):
        """Set of vocabulary (features) seen in any class label."""
        return set().union(*self._label_vocab.values())

    def train(self, *documents):
        """Train the classifier on a document-label pair(s).

        Args:
            documents: Tuple of (document, label) pair(s). Documents
                must be a collection of features. The label can be any
                hashable object, though is usually a string.

        Raises:
            TypeError: If a document is a string instead of a
                collection of features.
        """
        for document, label in documents:
            if isinstance(document, self._string_types):
                raise TypeError('Documents must be a collection of features')
            self._label_count[label] += 1
            for feature in document:
                # Only grow the vocabulary size the first time a feature
                # is seen for any label; the check must happen before
                # the feature is recorded below.
                if not any(feature in vocab
                           for vocab in self._label_vocab.values()):
                    self._vocab_size += 1
                self._label_vocab[label].add(feature)
                self._label_feature_count[label][feature] += 1
                self._label_length[label] += 1
                if self.top_features:
                    # NOTE(review): `MostCommon` is not defined in this
                    # section; it must be supplied by the enclosing
                    # module for `top_features` to be usable.
                    if not hasattr(self, '_most_common'):
                        self._most_common = defaultdict(
                            lambda: MostCommon(self.top_features))
                    seen = self._label_feature_count[label][feature]
                    self._most_common[label][feature] = seen

    def prior(self, label):
        """Prior probability of a label.

        Args:
            label: The target class label.

        Returns:
            The number of training instances that had the target
            `label`, divided by the total number of training instances.
            A `Fraction` when `exact` is set, otherwise the result of
            dividing the two counts.

        Raises:
            KeyError: If `label` has never been seen during training.
        """
        if label not in self._label_count:
            raise KeyError(label)
        total = sum(self._label_count.values())
        if self.exact:
            return Fraction(self._label_count[label], total)
        return self._label_count[label] / total

    def conditional(self, feature, label):
        """Conditional probability for a feature given a label.

        Laplace smoothing is applied: `laplace` is added to the feature
        count, and `laplace` times the vocabulary size is added to the
        denominator.

        Args:
            feature: The target feature.
            label: The target class label.

        Returns:
            The number of times the feature has been present across all
            training documents for the `label`, divided by the sum of
            the length of every training document for the `label`
            (both smoothed as described above). A `Fraction` when
            `exact` is set.

        Raises:
            KeyError: If `label` has never been seen during training.
        """
        if label not in self._label_count:
            raise KeyError(label)
        # Times feature seen across all documents in a label.
        numer = self.laplace
        # Avoid creating an entry if the term has never been seen.
        if feature in self._label_feature_count[label]:
            numer += self._label_feature_count[label][feature]
        denom = self._label_length[label] + (self._vocab_size * self.laplace)
        if self.exact:
            return Fraction(numer, denom)
        return numer / denom

    def _score(self, document, label):
        """Multinomial raw score of a document given a label.

        Args:
            document: Collection of features.
            label: The target class label.

        Returns:
            The multinomial raw score of the `document` given the
            `label`. In exact mode this is the product of the prior and
            the conditionals; otherwise it is the sum of their
            logarithms. In order to turn the raw score into a confidence
            value, this value should be divided by the sum of the raw
            scores across all class labels.

        Raises:
            TypeError: If `document` is a string instead of a
                collection of features.
            KeyError: If `label` has never been seen during training.
        """
        if isinstance(document, self._string_types):
            raise TypeError('Documents must be a list of features')
        if self.exact:
            score = self.prior(label)
        else:
            score = math.log(self.prior(label))
        for feature in document:
            # Feature selection by only considering the top-k
            # most common features (a form of dictionary trimming).
            if self.top_features and feature not in self._most_common[label]:
                continue
            conditional = self.conditional(feature, label)
            if self.exact:
                score *= conditional
            else:
                score += math.log(conditional)
        return score

    def _compute_scores(self, document):
        """Compute the multinomial score of a document for all labels.

        Args:
            document: Collection of features.

        Returns:
            A dict mapping class labels to the multinomial raw score
            for the `document` given the label.
        """
        return {x: self._score(document, x) for x in self.labels}

    def prob_all(self, document):
        """Probability of a document for all labels.

        Args:
            document: Collection of features.

        Returns:
            A dict mapping class labels to the confidence value that the
            `document` belongs to the label.

        Raises:
            ValueError: If the classifier has not been trained on any
                label yet.
        """
        score = self._compute_scores(document)
        if not score:
            raise ValueError('Cannot compute probabilities before training')
        # Bound up front so the assertion message below never fails with
        # a NameError in exact mode.
        normalize = None
        if not self.exact:
            # If the log-likelihood is too small, when we convert back
            # using `math.exp`, the result will round to zero. Shifting
            # by the maximum keeps the largest term at exp(0) == 1.
            normalize = max(score.values())
            assert normalize <= 0, normalize
            score = {x: math.exp(score[x] - normalize) for x in score}
        total = sum(score.values())
        assert total > 0, (total, score, normalize)
        if self.exact:
            return {label: Fraction(score[label], total) for label in score}
        return {label: score[label] / total for label in score}

    def prob(self, document, label):
        """Probability of a document given a label.

        Args:
            document: Collection of features.
            label: The target class label.

        Returns:
            The confidence value that the `document` belongs to `label`.
        """
        return self.prob_all(document)[label]

    def classify(self, document):
        """Get the most confident class label for a document.

        Args:
            document: Collection of features.

        Returns:
            A namedtuple representing the most confident class `label`
            and the value of the `confidence` in the label. For example:

            As tuple:
                ('positive', 0.85)
            As namedtuple:
                Prediction(label='positive', confidence=0.85)
        """
        prob = self.prob_all(document)
        label = max(prob, key=prob.get)
        return self.Prediction(label, prob[label])
# Copyright (C) 2013 Wesley Baugh
"""Detect watermark in images.

### Requires

- [Pillow](https://pypi.org/project/Pillow/) 2.0
"""
import glob
from classify import MultinomialNB
from PIL import Image
# Glob patterns locating the labelled training and test images.
TRAINING_POSITIVE = 'training-positive/*.jpg'
TRAINING_NEGATIVE = 'training-negative/*.jpg'
TEST_POSITIVE = 'test-positive/*.jpg'
TEST_NEGATIVE = 'test-negative/*.jpg'

# How many pixels to grab from the top-right of image.
# NOTE(review): the definition of these two names is missing from this
# copy of the script even though `get_image_data` reads them; 100x100 is
# a placeholder -- confirm against the original gist and tune it to the
# size of the watermark in your own images.
CROP_WIDTH, CROP_HEIGHT = 100, 100
# Size the cropped corner is scaled to before its pixels are read.
RESIZED = (16, 16)
def get_image_data(infile):
    """Turn the top-right corner of an image into a bag of features.

    The corner (`CROP_WIDTH` x `CROP_HEIGHT` pixels) is cropped, scaled
    to `RESIZED`, and every pixel is reduced to the integer average of
    its red, green and blue values. Each pixel position is then repeated
    as many times as that average, so brighter pixels contribute more
    occurrences of their location.

    Args:
        infile: Path (or file object) of the image, passed to
            `Image.open`.

    Returns:
        A list of integer pixel locations (indices into the resized
        region's pixel sequence), each repeated according to that
        pixel's brightness.
    """
    # Force three bands so `sum(pixel)` works for grayscale or palette
    # images too, not only for files that are already RGB.
    image = Image.open(infile).convert('RGB')
    width = image.size[0]
    # left upper right lower
    box = width - CROP_WIDTH, 0, width, CROP_HEIGHT
    resized = image.crop(box).resize(RESIZED)
    # Convert RGB to simple averaged value. Floor division keeps the
    # result an int, which the list repetition below requires.
    brightness = [sum(pixel) // 3 for pixel in resized.getdata()]
    # Combine location and value.
    values = []
    for location, value in enumerate(brightness):
        values.extend([location] * value)
    return values
def main():
    """Train a watermark classifier and report its accuracy.

    Every image matching the training globs is fed to a `MultinomialNB`
    with the label 'positive' or 'negative'. Every image matching the
    test globs is then classified and compared against its expected
    label. Progress and the final tally are printed to standard output.
    """
    watermark = MultinomialNB()

    # Training
    count = 0
    training_sets = ((TRAINING_POSITIVE, 'positive'),
                     (TRAINING_NEGATIVE, 'negative'))
    for pattern, label in training_sets:
        for infile in glob.glob(pattern):
            data = get_image_data(infile)
            watermark.train((data, label))
            count += 1
            # Single-argument print() emits the same text on Python 2
            # and Python 3.
            print('Training {0}'.format(count))

    # Testing
    correct, total = 0, 0
    test_sets = ((TEST_POSITIVE, 'positive'),
                 (TEST_NEGATIVE, 'negative'))
    for pattern, label in test_sets:
        for infile in glob.glob(pattern):
            data = get_image_data(infile)
            prediction = watermark.classify(data)
            if prediction.label == label:
                correct += 1
            total += 1
            print('Testing ({0} / {1})'.format(correct, total))
    print('Got {0} out of {1} correct'.format(correct, total))
# Run the train/test cycle only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Copy link

Hi Wesley,

I'm running a virtual Ubuntu machine (Linux ubuntu 2.6.32-58-generic #121-Ubuntu SMP Fri May 2 21:33:35 UTC 2014 i686 GNU/Linux)
and was trying to run your Python script.

getting the error below ::
sudo python install
running install
running build
running build_py
running install_lib
byte-compiling /usr/local/lib/python2.6/dist-packages/infer/ to classify.pyc
SyntaxError: ('invalid syntax', ('/usr/local/lib/python2.6/dist-packages/infer/', 278, 47, ' return {x: self._score(document, x) for x in self.labels}\n'))

also tried downloading the gist and installed all the requirements such as pillow 2.0 and tried :
andvine@ubuntu:/naveen$ python
Traceback (most recent call last):
File "", line 10, in
from classify import MultinomialNB
File "/home/sandvine/naveen/", line 192
return {x: self._score(document, x) for x in self.labels}
SyntaxError: invalid syntax

which also throws error. can you please help me on this.

Thanks and Regards,

Copy link

python2.7 needed

Copy link

Looks interesting. Does it get the "specific watermark" from the training set? I ask because I can't see it being provided as input. Please confirm. Thanks :)

Copy link

FYI, I see you specify to use Pillow 2.0, but if I'm dumb and ignore that and use Pillow 4.0, I only get 9 out of 10 matches using Python 2.7.5. Nothing a bit more training probably wouldn't solve though! Nice work btw!

Copy link

Cajet0 commented Jan 3, 2024

2023, used python 3.9 with pillow 4.0. Got 9/10 with the provided images also. Added my personal images to training pos/neg and tests. Got 16/21

Edit: Solved it looking at the code more thoroughly.

You have to use your own custom CROP_WIDTH and CROP_HEIGHT and also modify the crop coordinates according to your training image examples. Got 11/11 after that.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment