Skip to content

Instantly share code, notes, and snippets.

@i000313
Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save i000313/c8a40312ebeb3b8ed22d to your computer and use it in GitHub Desktop.
Save i000313/c8a40312ebeb3b8ed22d to your computer and use it in GitHub Desktop.
R function to get a "did you mean" suggestion from Google #R #didyoumean
############################################################################
# This file has the R functions didYouMean() and didYouMean2().
# The function didYouMean() is a copy found in the "didYouMean.R" file of
# the repository: https://github.com/samcarlos/didYouMean .
#
# The function didYouMean(input) is described below and was originally
# developed by "Sam Carlos". I just wrote a few comments to help me understand
# how this function is implemented. I also added the call to the options(...)
# function, in order to solve a SSL error on Windows 7.
#
# The function didYouMean2(input) was developed by me in order to handle
# accented characters and other special characters.
#
# IMPORTANT NOTE:
# Using these two functions may contravene Google's terms of use.
# If you call them many times, or Google otherwise detects the automated
# queries, Google may block your access to this feature.
#
# @date Sat May 24 00:30:39 2014
############################################################################
# Load the RCurl in order to use the getURL() function
library(RCurl)
# Point RCurl at the CA certificate bundle shipped inside the RCurl package,
# for every request made in this session. This is needed on Windows 7 to
# prevent the following SSL error (R version 3.0.2, RCurl_1.95-4.1):
# Error: SSL certificate problem, verify that the CA cert is OK. Details:
# error:14090086:SSL routines:SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
local({
  # Resolved inside local() so that no extra variable is left in the
  # workspace; options() itself still applies globally.
  caBundle <- system.file("CurlSSL", "cacert.pem", package = "RCurl")
  options(RCurlOptions = list(cainfo = caBundle))
})
############################################################################
# This function takes a string with errors and returns the google words that
# come after "Did you mean" or "Showing results for".
#
# @param input string to search on Google.
# @return the string suggested by Google or the original inputted string if
# none was suggested.
#
# Example 1:
# > didYouMean("senhur") # Wrong Portuguese word
# [1] "senhor" # Correct Portuguese word
#
# Example 2:
# > didYouMean("senhor") # Correct Portuguese word
# [1] "senhor" # Correct Portuguese word
############################################################################
# Ask Google for a spelling suggestion for `input`.
#
# The query is sent with a trailing "/" appended. That slash acts as an end
# marker: in the returned HTML the suggestion is expected to appear in a link
# of the form  <a href="/search?q=SUGGESTION/&amp;...  located shortly after
# the text "Did you mean" or "Showing results for".
# NOTE(review): this relies on the markup Google returned when the function
# was written; confirm that it still matches the current result page.
#
# @param input string to search on Google. It is NOT percent-encoded before
#        being sent (only spaces are replaced by "+"), and the suggestion is
#        returned as it appears in the link (see didYouMean2() for accented
#        characters).
# @return the string suggested by Google (with "+" turned back into spaces),
#         or `input` unchanged if no suggestion could be extracted.
didYouMean <- function(input) {
  # Pull the suggestion out of the 1000 characters of `page` that follow
  # offset `from`. Returns NULL when no "q=...../&" link is found there.
  extractSuggestion <- function(page, from) {
    snippet <- substring(page, from, from + 1000)
    # Literal search. The previous pattern "?q=" was a regular expression
    # starting with a quantifier, which is undefined behaviour; in practice
    # it matched just "q=", so the literal "q=" keeps the same offsets.
    keyPos <- regexpr("q=", snippet, fixed = TRUE)[1]
    if (keyPos == -1) {
      return(NULL)
    }
    afterKey <- substring(snippet, keyPos + 2)
    # Look for the end marker only AFTER the key, so a stray "/&" earlier in
    # the snippet cannot produce an inverted (empty) range.
    endPos <- regexpr("/&", afterKey, fixed = TRUE)[1]
    if (endPos <= 1) {
      # -1: marker missing; 1: the suggestion would be empty.
      return(NULL)
    }
    substring(afterKey, 1, endPos - 1)
  }

  # Replace all the spaces by "+"
  query <- gsub(" ", "+", input, fixed = TRUE)
  # Submit the request to Google and get the HTML page returned by it.
  # Only the first returned document is inspected, as before.
  page <- getURL(paste0("https://www.google.com/search?q=", query, "/"))[[1]]

  # "Did you mean" takes precedence over "Showing results for". A single
  # occurrence of the marker is enough (the old test required at least two).
  for (marker in c("Did you mean", "Showing results for")) {
    hit <- regexpr(marker, page, fixed = TRUE)[1]
    if (hit != -1) {
      suggestion <- extractSuggestion(page, hit)
      if (!is.null(suggestion)) {
        # Replace the "+" sign by " ", if any.
        return(gsub("+", " ", suggestion, fixed = TRUE))
      }
    }
  }

  # No suggestion: hand back exactly what the caller passed in (a literal
  # "+" in the input is no longer turned into a space).
  input
}
# The XML library is needed only for the didYouMean2() function
library(XML)
############################################################################
# The didYouMean() function does not handle accented characters.
# The didYouMean2() function solves this problem. For example:
#
# > didYouMean("construçãu sivil") # Wrong Portuguese string
# [1] "constru%C3%A7%C3%A3o civil" # Correct Portuguese string, but still percent-encoded
#
# > didYouMean2("construçãu sivil") # Wrong Portuguese string
# [1] ""construção civil"" # Correct Portuguese string
############################################################################
# Ask Google for a spelling suggestion for `input` and read it from the
# parsed HTML instead of from the raw link, so accented characters come back
# as text rather than percent-encoded.
#
# @param input string to search on Google.
# @return the concatenated text of every <a class="spell"> link in the
#         returned page, or "" when the page contains no such link.
# NOTE(review): the "spell" class is what Google used when this was written;
# confirm against the current result page.
didYouMean2 <- function(input) {
  # Percent-encode the UTF-8 form of the query so that reserved characters
  # ("&", "#", "+", "%", ...) and non-ASCII characters cannot alter or
  # truncate the URL. Spaces are still sent as "+", as before.
  encoded <- utils::URLencode(enc2utf8(as.character(input)), reserved = TRUE)
  query <- gsub("%20", "+", encoded, fixed = TRUE)
  # URLencode() leaves input that already looks percent-encoded untouched,
  # so any raw space that survived is replaced here.
  query <- gsub(" ", "+", query, fixed = TRUE)

  url <- paste0("https://www.google.com/search?q=", query)
  html <- getURL(url)
  # Parse the HTML and collect the text nodes of the suggestion link(s).
  doc <- htmlParse(html, asText = TRUE)
  pieces <- xpathSApply(doc, "//a[@class='spell']//text()", xmlValue)
  # The suggestion may be split over several text nodes (e.g. around <b>/<i>
  # tags); glue them back together. An empty result collapses to "".
  paste(pieces, collapse = "")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment