R - Query & Insert from Hive

Dependencies

R package

rjdbc : https://cran.r-project.org/web/packages/RJDBC/index.html

This allows R to connect to any DBMS that has a JDBC driver.

Hive JDBC drivers

Download the Hive JDBC drivers

Parameters

  • IP : Internet Protocol
  • Port
  • User_HDFS
  • Password_HDFS
  • Hive_Jdbc_Folder_Path : Path where the folder Hive JDBC is here.

Code explanation for Query and Insert from Hive

Loading JDBC driver and connection

drv <- JDBC(driverClass = "org.apache.hive.jdbc.HiveDriver",
            classPath = list.files("Hive_Jdbc_Folder_Path",pattern="jar$",full.names=T),
            identifier.quote="`")
hiveConnectionUrl <- "jdbc:hive2://IP:Port/;ssl=false"
conn <- dbConnect(drv, hiveConnectionUrl, "User_HDFS", "Password_HDFS")

Query and Insert : Examples

Show Tables

# All databases
dbListTables(conn) 
# The database "default"
dbGetQuery(conn, "show tables")

Get all elements of a table

# In database "default"
d <- dbReadTable(conn, "table_name")
# OR
d <- dbGetQuery(conn, "select * from table_name")
# Other that the database "default"
d <- dbReadTable(conn, nameBDD.table_name)

Create a table in parquet format 

dbSendUpdate(conn, "CREATE TABLE table_name (attribute1 string, attribute2 int) STORED AS PARQUET")

Insert data into a table

dbGetQuery(conn, "INSERT INTO table_name VALUES ('test', 1)")

Alternative solution using ODBC

ODBC
# Retrieve Hive drivers for Debianoids from Cloudera
if (!dir.exists("driver-hive")) dir.create("driver-hive")
if (!length(Sys.glob("driver-hive/clouderahiveodbc_*.deb"))) 
  download.file(paste0("https://downloads.cloudera.com/connectors/hive-2.5.25.1020/",
                       "Debian/clouderahiveodbc_2.5.25.1020-2_amd64.deb"),
                destfile = "driver-hive/clouderahiveodbc_2.5.25.1020-2_amd64.deb")

# Install driver
system("sudo dpkg -i driver-hive/clouderahiveodbc_*.deb", intern = TRUE)

# Setup driver and /etc/odbcinst.ini file
system("sudo apt-get install -y odbcinst", intern = TRUE)
system("sudo odbcinst -i -d -f /opt/cloudera/hiveodbc/Setup/odbcinst.ini", intern = TRUE)

# Install ODBC
system("sudo apt-get install -y unixodbc-dev", intern = TRUE)

# Install R package for ODBC
pkg <- "odbc"
if (!length(find.package(pkg, quiet = TRUE))) install.packages(pkg, repos = "https://cloud.r-project.org")

con <- DBI::dbConnect(odbc::odbc(),
                      Driver   = "Cloudera ODBC Driver for Apache Hive 64-bit",
                      Host     = "nn1",
                      Port     = "10000",
                      Schema   = "default",
                      AuthMech = 3,
                      UseSASL  = 1,
                      UID      = "mysaagieidasfirstname.lastname",
                      PWD      = URLencode(Sys.getenv("mysaagiepassword")))

# Check connection (list tables in schema)
DBI::dbGetQuery(con, "show tables")

# Example data frame
dat <- head(iris)

# Make syntactically valid names (remove dots)
names(dat) <- make.names(gsub("[._]", replacement = "", names(dat)))

# Write to Hive (and possibly overwrite)
DBI::dbSendQuery(con, "drop table if exists iris")
DBI::dbWriteTable(con, "iris", dat)

# Read back
dat2 <- DBI::dbReadTable(con, "iris")
# Or:
dat2 <- DBI::dbGetQuery(con, "select * from iris")