Skip to content

Instantly share code, notes, and snippets.

@ameripour71
Last active August 29, 2015 14:21
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ameripour71/5e28f609e3d142128e76 to your computer and use it in GitHub Desktop.
Save ameripour71/5e28f609e3d142128e76 to your computer and use it in GitHub Desktop.
Requirements for running this code:
Python 3.3
WordNet corpus for NLTK (Python library)
stemming.porter2 (Python library)
#*************************************** be name khoda ***************************************#
import codecs
import collections
from stemming.porter2 import stem # should be installed
import math
from nltk.corpus import wordnet as wn # should be installed
######################################### stop words #########################################
# Load the stop words, one per line.
# Only the line terminator is stripped: slicing off a fixed number of
# characters would truncate the last word when the file has no trailing newline.
# Blank lines are dropped so the list never contains an empty string.
with open("stop_word.txt") as s1:
    stop_word_list = [word for word in (line.rstrip("\r\n") for line in s1) if word]
########################################### symbols ###########################################
# Load the punctuation symbols, one per line (UTF-8, includes non-ASCII marks).
# codecs.open does no newline translation, so lines may end in "\r\n" or "\n";
# strip whichever terminator is present instead of assuming two characters,
# which would erase the symbol itself in an LF-only file.
with codecs.open("symbol.txt", encoding='utf-8') as s2:
    symbols_list = [symbol for symbol in (line.rstrip("\r\n") for line in s2) if symbol]
########################################## tokenizing ##########################################
# Load the tab-separated question/answer file.
# Column 1 is the question and column 2 is the answer; the first row is a
# header and is skipped. question_list[i] and answer_list[i] always describe
# the same row.
question_list = []
answer_list = []
with codecs.open("question_answer_pairs.txt", encoding='utf-8') as s3:
    next(s3, None)  # skip the header row (no error if the file is empty)
    for line in s3:
        line_parts = line.split('\t')
        if len(line_parts) < 3:
            # blank or malformed row: nothing usable, and indexing would fail
            continue
        question_list.append(line_parts[1])
        answer_list.append(line_parts[2])
##################### tokenizing & calculating needed parameters for tfidf #####################
# tokenizing: split every question on whitespace
token_question_list = [question.split() for question in question_list]

# removing stop words, symbols and stemming
#
# token_question_list2[i] : unique stemmed tokens of question i
# token_question_list3[i] : (token, count) pairs of question i, most frequent first
# all_tokens_list         : every distinct token, in first-seen order
#
# token_question_list2[i] is kept in the same order as token_question_list3[i]
# so that a position in one list identifies the same token in the other.
_skip_tokens = frozenset(stop_word_list) | frozenset(symbols_list)
_seen_tokens = set()  # O(1) duplicate check for all_tokens_list

token_question_list2 = []
token_question_list3 = []
all_tokens_list = []
for question_tokens in token_question_list:
    kept = []
    for t in question_tokens:
        t = t.lower()
        if t.endswith('?'):
            # handle tokens like "physicist?" (no space between word and symbol)
            t = t[:-1]
        # Skip empty leftovers (a bare "?"), stop words and symbols. The token
        # must be tested against BOTH collections explicitly.
        if not t or t in _skip_tokens:
            continue
        t = stem(t)
        kept.append(t)
        if t not in _seen_tokens:
            _seen_tokens.add(t)
            all_tokens_list.append(t)
    # (token, count) pairs; the first pair is the most frequent token
    counts = collections.Counter(kept).most_common()
    token_question_list3.append(counts)
    token_question_list2.append([token for token, _count in counts])

# document frequency: the number of questions that contain each token of
# all_tokens_list (token_frequency_list[k] belongs to all_tokens_list[k])
_document_counts = collections.Counter()
for unique_tokens in token_question_list2:
    _document_counts.update(unique_tokens)
token_frequency_list = [_document_counts[t] for t in all_tokens_list]
##################################### calculating tf & idf #####################################
# Augmented term frequency, one row per question, columns in the order of
# token_question_list3[i]:
#   tf(t, d) = 0.5 + (0.5 * f(t, d)) / max{ f(w, d) : w in d }
# The first pair of each row holds the highest count, so it is the maximum.
tf_list = [
    [0.5 + float((1 / 2) * float(count)) / float(pairs[0][1]) for _token, count in pairs]
    for pairs in token_question_list3
]

# Inverse document frequency, same layout as tf_list:
#   idf(t, D) = log( N / (number of questions that contain t) )
N = len(token_question_list3)
idf_list = []
for pairs in token_question_list3:
    idf_row = []
    for token, _count in pairs:
        position = all_tokens_list.index(token)
        value = math.log(float(N) / float(token_frequency_list[position]))
        # a token present in every question has log(1) == 0; it is replaced by 1
        idf_row.append(1 if value == 0 else value)
    idf_list.append(idf_row)

# tf-idf = tf * idf, element by element
tf_idf_list = [
    [tf_value * idf_value for tf_value, idf_value in zip(tf_row, idf_row)]
    for tf_row, idf_row in zip(tf_list, idf_list)
]
####################################### query expansion ######################################
def query_expansion(word):
    """Return the distinct synset head words WordNet lists for *word*.

    Each synset is converted to text and the characters from index 8 up to
    the first '.' are kept.
    NOTE(review): this presumably relies on the text form looking like
    "Synset('dog.n.01')" -- confirm against the installed NLTK version.

    The result is a list without duplicates, in no particular order.
    """
    heads = set()
    for synset in wn.synsets(word):
        text = str(synset)
        heads.add(text[8:text.find('.')])
    return list(heads)
######################################### user input ##########################################
user_query = input("please enter your question : ")
# kept as text here; it is converted to a number where the results are printed
num_return = input("please enter the number of question answer pairs you want to see : ")
user_query_list = user_query.split()

_query_skip = frozenset(stop_word_list) | frozenset(symbols_list)

# Clean each query word the same way the indexed questions are cleaned
# (lower-case, drop one trailing '?'), then add its WordNet synonyms.
# Stop words and symbols are dropped BEFORE expansion: expanding a word that
# is going to be discarded anyway only adds unrelated terms to the query.
user_query_list_expanded = []
for word in user_query_list:
    word = word.lower()
    if word.endswith('?'):
        word = word[:-1]
    if not word or word in _query_skip:
        continue
    user_query_list_expanded.append(word)
    user_query_list_expanded.extend(query_expansion(word))
user_query_list_expanded2 = list(set(user_query_list_expanded))

# Filter and stem the expanded terms. The term must be tested against both
# the stop words and the symbols explicitly.
user_query_list2 = []
for word in user_query_list_expanded2:
    word = word.lower()
    if not word or word in _query_skip:
        continue
    user_query_list2.append(stem(word))

# (token, count) pairs of the query, most frequent first
user_query_list3 = collections.Counter(user_query_list2).most_common()

# tf: same augmented formula as for the questions
#   tf(t, q) = 0.5 + (0.5 * f(t, q)) / max count in q
user_tf_list = []
if user_query_list3:
    _max_count = user_query_list3[0][1]
    user_tf_list = [0.5 + (0.5 * count) / _max_count for _token, count in user_query_list3]

# idf: taken from the question collection; a term that occurs in no question
# gets 0.0, so it contributes nothing to the similarity
N = len(token_question_list3)
_token_position = {token: position for position, token in enumerate(all_tokens_list)}
user_idf_list = []
for token, _count in user_query_list3:
    position = _token_position.get(token)
    if position is None:
        user_idf_list.append(0.0)
    else:
        user_idf_list.append(math.log(float(N) / float(token_frequency_list[position])))

# tf-idf, aligned with user_query_list3
user_tf_idf_list = [tf_value * idf_value for tf_value, idf_value in zip(user_tf_list, user_idf_list)]
############################## calculating vector similarity #################################
# Cosine similarity between the query q and every question d, with W = tf * idf:
#
#   sim(q, d) = sum_k(W_kq * W_kd) / ( sqrt(sum_k W_kq^2) * sqrt(sum_k W_kd^2) )
#
# Weights are looked up BY TOKEN. user_tf_idf_list is aligned with
# user_query_list3 and tf_idf_list[i] is aligned with token_question_list3[i],
# so pairing them with zip gives each token its own weight.
query_weights = {
    token: weight
    for (token, _count), weight in zip(user_query_list3, user_tf_idf_list)
}
query_norm = math.sqrt(sum(weight * weight for weight in query_weights.values()))

sim_list = []  # sim_list[i] is the similarity of the query to question_list[i]
for question_counts, question_tf_idf in zip(token_question_list3, tf_idf_list):
    dot_product = 0.0
    squared_sum = 0.0
    for (token, _count), weight in zip(question_counts, question_tf_idf):
        squared_sum += weight * weight
        # tokens missing from the query contribute 0 to the dot product
        dot_product += weight * query_weights.get(token, 0.0)
    # the two vector lengths are multiplied (not added) in the denominator
    denominator = math.sqrt(squared_sum) * query_norm
    if denominator == 0:
        sim_list.append(0)  # empty question or query with no known term
    else:
        sim_list.append(dot_product / denominator)

# Print the requested number of pairs, best match first. The count is clamped
# to the range [0, number of questions].
num = max(0, min(len(sim_list), int(num_return)))
# A stable sort keeps the original file order for equal scores and lists
# every question exactly once.
ranked_indexes = sorted(range(len(sim_list)), key=sim_list.__getitem__, reverse=True)
for best_index in ranked_indexes[:num]:
    print("question : ", question_list[best_index])
    print("answer : ", answer_list[best_index])
ArticleTitle Question Answer DifficultyFromQuestioner DifficultyFromAnswerer ArticleFile
Alessandro_Volta Is Volta buried in the city of Pittsburgh? no easy easy data/set4/a10
Alessandro_Volta Is Volta buried in the city of Pittsburgh? no easy easy data/set4/a10
Alessandro_Volta Did Volta have a passion for the study of electricity? yes easy medium data/set4/a10
Alessandro_Volta Is it a disadvantage for something to be unsafe to handle? yes hard too hard data/set4/a10
Amedeo_Avogadro Was Amedeo Avogadro Italian? Yes easy easy data/set4/a8
Amedeo_Avogadro Did Amedeo Avogadro graduate? yes easy easy data/set4/a8
Amedeo_Avogadro Where was Avogadro a professor of physics? University of Turin medium medium data/set4/a8
Arabic_language Is arabic source of vocabulary? NULL NULL NULL data/set5/a3
Arabic_language Is arabic language of the Qur an? NULL NULL NULL data/set5/a3
Arabic_language Is it true that most speak Arabic as their native language? NULL NULL NULL data/set5/a3
Arabic_language Are pronouns in Literary Arabic marked for person , number and gender ? yes NULL easy data/set5/a3
Arabic_language However , are non-human plural nouns grammatically considered to be feminine singular ? yes NULL easy data/set5/a3
Bee Is a bee an insect? yes easy easy data/set1/a8
Bee Are bees related to ants? yes easy hard data/set1/a8
Bee Have managed populations of European honey bees experienced substantial declines? yes easy easy data/set1/a8
Bee Where do bees live? in colonies medium medium data/set1/a8
Bee When do bumblebee colonies reach peak population? mid to late summer medium medium data/set1/a8
Bee What do people commonly call cleptoparasitic bees? cuckoo bees medium medium data/set1/a8
Bee How did vespoid wasps evolve? from predatory ancestors hard medium data/set1/a8
Beijing Where does air pollution in Beijing come from? surrounding cities and provinces hard medium data/set3/a7
Beijing Are famous middle schools in Beijing :? Yes NULL easy data/set3/a7
Blaise_Pascal Has the name Pascal been given to the SI unit of pressure? yes easy easy data/set4/a4
Blaise_Pascal Has the name Pascal been given to the SI unit of pressure? yes easy easy data/set4/a4
Blaise_Pascal From what did Pascal suffer throughout his life? ill health medium medium data/set4/a4
Cello What cello manufacturer should I buy from if I want to play outside? Luis & Clark hard hard data/set2/a9
Cello What cello manufacturer should I buy from if I want to play outside? Luis & Clark hard hard data/set2/a9
Charles-Augustin_de_Coulomb Was Charles-Augustin de Coulomb a member of the National Institute? Yes easy easy data/set4/a6
Chinese_language What was Chinese language`s profession? NULL NULL NULL data/set5/a7
Chinese_language What is Chinese language`s first name? NULL NULL NULL data/set5/a7
Flute When was the pan flute spread to other parts of Europe? 7th century BC. hard hard data/set2/a5
Flute What does the air stream across this hole create? A Bernoulli, or siphon. hard medium data/set2/a5
Fox Are foxes wary of humans? Yes easy easy data/set1/a6
Fox Are foxes wary of humans? Yes easy easy data/set1/a6
Fox Are fennec foxes endangered? no easy easy data/set1/a6
Fox Are fennec foxes endangered? No easy easy data/set1/a6
Fox Does the diet of foxes include reptiles? yes easy easy data/set1/a6
Giraffe Do male giraffes weigh more than female giraffes? Yes hard medium data/set1/a5
Giraffe Do male giraffes weigh more than female giraffes? yes hard hard data/set1/a5
Giraffe Do male giraffes have larger horns than female giraffes? Yes hard medium data/set1/a5
Giraffe Do male giraffes have larger horns than female giraffes? Yes hard medium data/set1/a5
Giraffe Are male females generally taller than female giraffes? Yes hard medium data/set1/a5
Henri_Becquerel Was Henri Becquerel one of the discoverers of radioactivity? Yes easy easy data/set4/a9
Henri_Becquerel Was Henri Becquerel one of the discoverers of radioactivity? Yes easy easy data/set4/a9
Isaac_Newton Who did Newton see as the master creator? God medium medium data/set4/a1
Isaac_Newton Who did Newton see as the master creator? Newton saw God as the master creator whose existence could not be denied in the face of the grandeur of all creation medium easy data/set4/a1
Isaac_Newton When did Netwon investigate the refraction of light From 1670 to 1672 hard hard data/set4/a1
Isaac_Newton When did Netwon investigate the refraction of light 1670-1672 hard hard data/set4/a1
James_Watt James Watt's improvements of what were fundamental to the changes wrought by the Industrial Revolution? The steam engine. easy medium data/set4/a2
James_Watt James Watt's improvements of what were fundamental to the changes wrought by the Industrial Revolution? steam engine easy hard data/set4/a2
James_Watt James Watt was born where? Greenock, a seaport on the Firth of Clyde easy medium data/set4/a2
James_Watt James Watt ranked first among how many people in Charles Murray's survey of historiometry? 229 medium hard data/set4/a2
London What city in the UK has been subjected to bouts of terrorism? London hard hard data/set3/a1
London What countries did James VI of Scotland unite? James VI of Scotland united Scotland and England. hard hard data/set3/a1
London What countries did James VI of Scotland unite? England and Scotland hard too hard data/set3/a1
Michael_Faraday What did Joseph Henry likely discover? self-induction medium medium data/set4/a7
Michael_Faraday What did the University of Oxford grant Faraday? a Doctor of Civil Law degree (honorary) medium medium data/set4/a7
Michael_Faraday Who was made to travel outside the coach? Faraday hard hard data/set4/a7
Michael_Faraday Did Faraday construct the ancestor of modern power generators? yes hard easy data/set4/a7
Michael_Faraday Did Faraday lecture on education in 1854? yes hard easy data/set4/a7
Nassau Who was Fort Nassau built by? the Dutch medium medium data/set3/a2
Nassau What is Nassau Coliseum? an arena in Uniondale, New York, USA medium medium data/set3/a2
Nassau What was the Dead or Alive 4 fighting arena modeled after? a Magnetic Accelerator Cannon station from Halo 2 medium medium data/set3/a2
Nassau Is Nassau Range the highest mountain range in the world? no hard easy data/set3/a2
Nassau Is Nassau County named after a German town? yes hard hard data/set3/a2
Nassau Does the United States have a base near Glasgow? yes hard hard data/set3/a2
Nikola_Tesla Did Tesla study electrical engineering? Yes easy easy data/set4/a3
Nikola_Tesla Did Tesla study electrical engineering? yes easy easy data/set4/a3
Santiago Was Santiago the name of an indie/punk band? Yes easy easy data/set3/a6
Santiago Is Santiago a name in Spanish? yes easy easy data/set3/a6
Spanish_language Is it true that mexico contains the largest population of Spanish speakers? NULL NULL NULL data/set5/a8
Spanish_language Mexico contains the largest population of what? NULL NULL NULL data/set5/a8
Spanish_language Is this phenomenon notable in Brazil? NULL NULL NULL data/set5/a8
Swan What is the color of the Australian Black Swan? Completely black except for the white feathers on its wings hard medium data/set1/a10
Tiger Is the Tiger a member of the Felidar family? No. easy easy data/set1/a3
Tiger What are the typical features of a tiger country? Good cover, close to water, and plenty of prey medium medium data/set1/a3
Tiger Would it be common for tigers to mate in January? Yes. hard hard data/set1/a3
Trumpet Does a trumpet have a mellower tone than a cornet? No hard hard data/set2/a3
Turtle Is the name of the upper shell of a turtle called the plastron? No. easy hard data/set1/a4
Turtle Do scutes fall away from the turtle's shell? Some of the scutes eventually fall away from the shell. easy hard data/set1/a4
a
a's
able
about
above
according
accordingly
across
actually
after
afterwards
again
against
ain't
all
allow
allows
almost
alone
along
already
also
although
always
am
among
amongst
an
and
another
any
anybody
anyhow
anyone
anything
anyway
anyways
anywhere
apart
appear
appreciate
appropriate
are
aren't
around
as
aside
ask
asking
associated
at
available
away
awfully
b
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
believe
below
beside
besides
best
better
between
beyond
both
brief
but
by
c
c'mon
c's
came
can
can't
cannot
cant
cause
causes
certain
certainly
changes
clearly
co
com
come
comes
concerning
consequently
consider
considering
contain
containing
contains
corresponding
could
couldn't
course
currently
d
definitely
described
despite
did
didn't
different
do
does
doesn't
doing
don't
done
down
downwards
during
e
each
edu
eg
eight
either
else
elsewhere
enough
entirely
especially
et
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
exactly
example
except
f
far
few
fifth
first
five
followed
following
follows
for
former
formerly
forth
four
from
further
furthermore
g
get
gets
getting
given
gives
go
goes
going
gone
got
gotten
greetings
h
had
hadn't
happens
hardly
has
hasn't
have
haven't
having
he
he's
hello
help
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
hi
him
himself
his
hither
hopefully
how
howbeit
however
i
i'd
i'll
i'm
i've
ie
if
ignored
immediate
in
inasmuch
inc
indeed
indicate
indicated
indicates
inner
insofar
instead
into
inward
is
isn't
it
it'd
it'll
it's
its
itself
j
just
k
keep
keeps
kept
know
knows
known
l
last
lately
later
latter
latterly
least
less
lest
let
let's
like
liked
likely
little
look
looking
looks
ltd
m
mainly
many
may
maybe
me
mean
meanwhile
merely
might
more
moreover
most
mostly
much
must
my
myself
n
name
namely
nd
near
nearly
necessary
need
needs
neither
never
nevertheless
new
next
nine
no
nobody
non
none
noone
nor
normally
not
nothing
novel
now
nowhere
o
obviously
of
off
often
oh
ok
okay
old
on
once
one
ones
only
onto
or
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
own
p
particular
particularly
per
perhaps
placed
please
plus
possible
presumably
probably
provides
q
que
quite
qv
r
rather
rd
re
really
reasonably
regarding
regardless
regards
relatively
respectively
right
s
said
same
saw
say
saying
says
second
secondly
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sensible
sent
serious
seriously
seven
several
shall
she
should
shouldn't
since
six
so
some
somebody
somehow
someone
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specified
specify
specifying
still
sub
such
sup
sure
t
t's
take
taken
tell
tends
th
than
thank
thanks
thanx
that
that's
thats
the
their
theirs
them
themselves
then
thence
there
there's
thereafter
thereby
therefore
therein
theres
thereupon
these
they
they'd
they'll
they're
they've
think
third
this
thorough
thoroughly
those
though
three
through
throughout
thru
thus
to
together
too
took
toward
towards
tried
tries
truly
try
trying
twice
two
u
un
under
unfortunately
unless
unlikely
until
unto
up
upon
us
use
used
useful
uses
using
usually
uucp
v
value
various
very
via
viz
vs
w
want
wants
was
wasn't
way
we
we'd
we'll
we're
we've
welcome
well
went
were
weren't
what
what's
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who's
whoever
whole
whom
whose
why
will
willing
wish
with
within
without
won't
wonder
would
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
zero
،
.
,
'
"
-
=
/
\
|
*
÷
+
!
@
×
~
`
#
$
%
^
&
)
(
}
{
_
[
]
؟
?
>
<
:
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment