Last active
September 19, 2018 13:09
-
-
Save matt-dray/924421d57c3f568d4bc2d6465e00f02c to your computer and use it in GitHub Desktop.
Use odbc and DBI to connect to two SQL databases, query them and join the outputs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Connecting R to SQL (with DBI and odbc) | |
# Original: 18 July 2018 | |
# Matt Dray | |
# Goal: join two datasets from different databases | |
# Basic approach: | |
# 1. Connect to database A, perform query, disconnect | |
# 2. As above for database B | |
# 3. Join the two dataframes returned from steps 1 and 2 | |
# Packages ---------------------------------------------------------------- | |
# Install the packages if you haven't already | |
# install.packages(c("DBI", "odbc", "dplyr", "devtools", "httr") | |
library(DBI) # R database interface | |
library(odbc) # driver for an ODBC database | |
library(dplyr) # data manipulation | |
library(devtools) # to donwload dfeR package from github | |
library(httr) # to help change proxy to get dfeR package | |
# The dfeR package is a work-in-progress package of functions used throughout | |
# the department; you can get involved by submitting your own functions. | |
# See https://dfe-analytical-services.github.io/dfeR/ for documentation | |
# See https://github.com/dfe-analytical-services/dfeR for code on GitHub | |
# dfeR isn't on CRAN (the normal place for downloading packages), so | |
# install.packages("dfeR") won't work. We need install_github() from the | |
# devtools package. You need R version 3.4.1 or later. | |
# To install packages from GitHub, we also need to change our proxy settings | |
# (this allows us to access GitHub). | |
# Let's change the proxy. Boxes will pop up asking for your work username and | |
# password. Once you've completed these, you'll be good to go. | |
set_config( | |
use_proxy( | |
url = paste( | |
"http://ad\\", | |
rstudioapi::showPrompt( | |
title = "Username", | |
message = "Username", | |
default = "" | |
), | |
":", | |
rstudioapi::askForPassword("AD account password"), | |
"@192.168.2.40:8080", | |
sep = "" | |
), | |
port = 8080 | |
) | |
) | |
# If you want to remember this code, bookmark this page: | |
# https://gist.github.com/matt-dray/94bea6823395c4639216da798f584e23 | |
# Now we can install from GitHub (verbose download information will appear in | |
# the console during installation). | |
devtools::install_github("dfe-analytical-services/dfeR") # install | |
library("dfeR") | |
# Connect ----------------------------------------------------------------- | |
# Create the database connection string (you must have access to the database). | |
# i.e. Provide the 'address' of the database you're connecting to. Note that | |
# we're using the sql_conn_string() function from the dfeR package. | |
conn_string <- dfeR::sql_conn_string( | |
server = "0XXXXX-XXX00\\XXXXXX", # change to your server | |
database = "XXXX" # change to your database | |
) | |
# Now create an object that stores the connection details | |
conn <- dbConnect( # function to create connection | |
odbc::odbc(), # driver for ODBC database | |
.connection_string = conn_string # the connection information | |
) | |
# Query first database ---------------------------------------------------- | |
# There is a prewritten function in the dfeR package that reads an SQL | |
# script you've stored somewhere (see the 'dfeR package' section above for how | |
# to install this package). Simply provide a filepath to your script. Note that | |
# this function strips out the comments that are are in the stored .sql file. | |
query1 <- read_sql_script(file = "script/query1.sql") # filepath to .sql query | |
# Have a look at the query you just read in | |
prewritten_query | |
# And now we can use the query and connection in the dbGetQuery() function | |
query1_df <- dbGetQuery( # function for sending a query to a database | |
conn, # to this database connection | |
query1 # perfrom this query | |
) | |
# Take a look at dataframe that's been returned to check everything's in order | |
glimpse(query1_df) | |
# Now we should disconnect from this database | |
dbDisconnect(conn) | |
# Query second database --------------------------------------------------- | |
# Now build a new connection string with sql_conn_string() for the other | |
# database you need to extract data from and connect to it with the dbConnect() | |
# function. | |
# Then use read_sql_script() to perform the query; making sure to give a | |
# sensible name to the dataframe object that's returned (i.e. i called the first | |
# dataframe 'query1_df' in the section above, so why not call this one | |
# query2_df?) | |
# Remember to disconnect from the database with dbDisconnect when you're done. | |
# Join -------------------------------------------------------------------- | |
# So we have query1_df and query2_df. They're dataframes that contain data from | |
# two different databases. Assuming they have a matching key, we can use a join | |
# function from dplyr to match them. Let's assume the key column name is 'id' in | |
# our first dataframe, but 'ID' in the second dataframe. If the matching key | |
# column has same name in both dataframes, then 'by = "ID"' will suffice. | |
joined_data <- left_join( | |
x = query1_df, | |
y = query2_df, | |
by = c("id" = "ID") | |
) | |
# So now the datsets should be joined | |
glimpse(joined_data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment