R - Query & Insert from Hive
Github project : example-R-Query-Insert-From-Hive
Dependencies
R package
rjdbc : https://cran.r-project.org/web/packages/RJDBC/index.html
This allows R to connect to any DBMS that has a JDBC driver.
Hive JDBC drivers
Download the Hive JDBC drivers
Parameters
- IP : Internet Protocol
- Port
- User_HDFS
- Password_HDFS
- Hive_Jdbc_Folder_Path : Path where the folder Hive JDBC is here.
Code explanation for Query and Insert from Hive
Loading JDBC driver and connection
drv <- JDBC(driverClass = "org.apache.hive.jdbc.HiveDriver", classPath = list.files("Hive_Jdbc_Folder_Path",pattern="jar$",full.names=T), identifier.quote="`") hiveConnectionUrl <- "jdbc:hive2://IP:Port/;ssl=false" conn <- dbConnect(drv, hiveConnectionUrl, "User_HDFS", "Password_HDFS")
Query and Insert : Examples
Show Tables
# All databases dbListTables(conn) # The database "default" dbGetQuery(conn, "show tables")
Get all elements of a table
# In database "default" d <- dbReadTable(conn, "table_name") # OR d <- dbGetQuery(conn, "select * from table_name") # Other that the database "default" d <- dbReadTable(conn, nameBDD.table_name)
Create a table in parquet format
dbSendUpdate(conn, "CREATE TABLE table_name (attribute1 string, attribute2 int) STORED AS PARQUET")
Insert data into a table
dbGetQuery(conn, "INSERT INTO table_name VALUES ('test', 1)")
Alternative solution using ODBC
ODBC
# Retrieve Hive drivers for Debianoids from Cloudera if (!dir.exists("driver-hive")) dir.create("driver-hive") if (!length(Sys.glob("driver-hive/clouderahiveodbc_*.deb"))) download.file(paste0("https://downloads.cloudera.com/connectors/hive-2.5.25.1020/", "Debian/clouderahiveodbc_2.5.25.1020-2_amd64.deb"), destfile = "driver-hive/clouderahiveodbc_2.5.25.1020-2_amd64.deb") # Install driver system("sudo dpkg -i driver-hive/clouderahiveodbc_*.deb", intern = TRUE) # Setup driver and /etc/odbcinst.ini file system("sudo apt-get install -y odbcinst", intern = TRUE) system("sudo odbcinst -i -d -f /opt/cloudera/hiveodbc/Setup/odbcinst.ini", intern = TRUE) # Install ODBC system("sudo apt-get install -y unixodbc-dev", intern = TRUE) # Install R package for ODBC pkg <- "odbc" if (!length(find.package(pkg, quiet = TRUE))) install.packages(pkg, repos = "https://cloud.r-project.org") con <- DBI::dbConnect(odbc::odbc(), Driver = "Cloudera ODBC Driver for Apache Hive 64-bit", Host = "nn1", Port = "10000", Schema = "default", AuthMech = 3, UseSASL = 1, UID = "mysaagieidasfirstname.lastname", PWD = URLencode(Sys.getenv("mysaagiepassword"))) # Check connection (list tables in schema) DBI::dbGetQuery(con, "show tables") # Example data frame dat <- head(iris) # Make syntactically valid names (remove dots) names(dat) <- make.names(gsub("[._]", replacement = "", names(dat))) # Write to Hive (and possibly overwrite) DBI::dbSendQuery(con, "drop table if exists iris") DBI::dbWriteTable(con, "iris", dat) # Read back dat2 <- DBI::dbReadTable(con, "iris") # Or: dat2 <- DBI::dbGetQuery(con, "select * from iris")