Last active
August 29, 2015 14:01
-
-
Save i000313/c8a40312ebeb3b8ed22d to your computer and use it in GitHub Desktop.
R function to get a "did you mean" suggestion from Google #R #didyoumean
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################ | |
# This file has the R functions didYouMean() and didYouMean2(). | |
# The function didYouMean() is a copy found in the "didYouMean.R" file of | |
# the repository: https://github.com/samcarlos/didYouMean . | |
# | |
# The function didYouMean(input) is described below and was originally | |
# developed by "Sam Carlos". I just wrote a few comments to help me understand | |
# how this function is implemented. I also added the call to the options(...) | |
# function, in order to solve a SSL error on Windows 7. | |
# | |
# The function didYouMean2(input) was developed by me in order to handle | |
# accented characters and other special characters. | |
# | |
# IMPORTANT NOTE: | |
# The use of these two functions may contravene Google's terms of use.
# As such, if you call these functions many times, or Google otherwise finds
# out, this feature could be blocked for you by Google.
# | |
# @date Sat May 24 00:30:39 2014 | |
############################################################################ | |
# Load the RCurl in order to use the getURL() function | |
library(RCurl) | |
# Set the SSL CA bundle globally. This is needed on Windows 7 to prevent the
# SSL error (R version 3.0.2, RCurl_1.95-4.1):
# Error: SSL certificate problem, verify that the CA cert is OK. Details:
# error:14090086:SSL routines:SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
#
# The bundle path is merged into any RCurlOptions that are already set, so
# options chosen elsewhere are kept. Nothing is changed when the bundle file
# cannot be located (system.file() returns "" in that case), which avoids
# pointing curl at an empty path.
local({
  ca_bundle <- system.file("CurlSSL", "cacert.pem", package = "RCurl")
  if (nzchar(ca_bundle)) {
    current <- as.list(getOption("RCurlOptions", default = list()))
    options(RCurlOptions = utils::modifyList(current, list(cainfo = ca_bundle)))
  }
})
############################################################################
# This function takes a string with errors and returns the words that Google
# links to after "Did you mean" or "Showing results for".
#
# The result page is fetched with getURL() (package RCurl) and scanned as
# plain text; the suggestion is the query found in the first search link that
# follows one of the two marker phrases.
#
# Limitation: the suggestion is returned exactly as it is written in the
# link, so accented characters come back percent-encoded
# (e.g. "constru%C3%A7%C3%A3o civil"). See didYouMean2() for that case.
#
# @param input string to search on Google.
# @return the string suggested by Google, or the inputted string (with every
#         "+" shown as a space) if no suggestion could be extracted.
#
# Example 1:
# > didYouMean("senhur") # Wrong Portuguese word
# [1] "senhor" # Correct Portuguese word
#
# Example 2:
# > didYouMean("senhor") # Correct Portuguese word
# [1] "senhor" # Correct Portuguese word
############################################################################
didYouMean <- function(input) {
  # Google separates the words of a query with "+".
  query <- gsub(" ", "+", input, fixed = TRUE)

  # What is returned when no suggestion can be extracted.
  fallback <- gsub("+", " ", query, fixed = TRUE)

  # The trailing "/" is sent on purpose: the suggestion link is expected to
  # carry it too (e.g. <a href="/search?q=senhor/&...), so "/&" marks the end
  # of the suggested query.
  page <- getURL(paste0("https://www.google.com/search?q=", query, "/"))
  if (length(page) != 1L || is.na(page)) {
    return(fallback)
  }

  # Returns the query of the first search link found within 1000 characters
  # after `marker`, or NULL when the marker or a well-formed link is missing.
  extractSuggestion <- function(marker) {
    markerAt <- as.integer(regexpr(marker, page, fixed = TRUE))
    if (markerAt == -1L) {
      return(NULL)
    }
    window <- substring(page, markerAt, markerAt + 1000L)

    # Start of the query: "q=" preceded by "?" or by a "&" / "&amp;"
    # parameter separator (3 characters in total).
    queryAt <- as.integer(regexpr("[?&;]q=", window))
    if (queryAt == -1L) {
      return(NULL)
    }
    rest <- substring(window, queryAt + 3L)

    # End of the query: the first "/&" AFTER the start of the query.
    endAt <- as.integer(regexpr("/&", rest, fixed = TRUE))
    if (endAt <= 1L) {
      # -1: no terminator found; 1: the suggestion would be empty.
      return(NULL)
    }
    substring(rest, 1L, endAt - 1L)
  }

  # "Did you mean" takes precedence over "Showing results for".
  for (marker in c("Did you mean", "Showing results for")) {
    suggestion <- extractSuggestion(marker)
    if (!is.null(suggestion)) {
      return(gsub("+", " ", suggestion, fixed = TRUE))
    }
  }

  fallback
}
# The XML library is needed only for the didYouMean2() function | |
library(XML) | |
############################################################################
# The didYouMean() function does not handle accented characters.
# The didYouMean2() function solves this problem. For example:
#
# > didYouMean("construçãu sivil") # Wrong Portuguese string
# [1] "constru%C3%A7%C3%A3o civil" # Correct Portuguese string (still URL-encoded)
#
# > didYouMean2("construçãu sivil") # Wrong Portuguese string
# [1] "construção civil" # Correct Portuguese string
#
# Instead of scanning the raw page text, this function parses the HTML with
# htmlParse() (package XML) and reads the text of the <a class="spell">
# elements.
#
# @param input string to search on Google.
# @return the text of the suggestion link, or the inputted string unchanged
#         if the page has no such link (same contract as didYouMean()).
############################################################################
didYouMean2 <- function(input) {
  # Google separates the words of a query with "+".
  query <- gsub(" ", "+", input, fixed = TRUE)
  html <- getURL(paste0("https://www.google.com/search?q=", query))

  # Parse the page and collect every text node inside the "spell" link(s).
  doc <- htmlParse(html, asText = TRUE)
  pieces <- xpathSApply(doc, "//a[@class='spell']//text()", xmlValue)

  # xpathSApply() yields an empty list when nothing matches; unlist() makes
  # both cases a plain vector before the pieces are glued together.
  suggestion <- paste(unlist(pieces), collapse = "")

  # Without a suggestion link the result would be "", so fall back to the
  # input instead.
  if (nzchar(suggestion)) suggestion else input
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment