Introduction to R programming with SQL knowledge

Sample code:

This code is to get data from ES and process it with SQL syntax

#########################
### Load library
#########################
RTA::global_setup()
RTA::load_packages(RTA, dplyr, stringr, jsonlite, sqldf, lubridate)
options(encoding = "UTF-8")
Sys.setlocale(category = "LC_ALL", locale = "en_US.UTF-8")

#########################
### GET ES functions
#########################
download_es_data <- function(index, post_body = "{}", host = "https://es.rta.vn", size = 10000, ...) {
  if (missing(index)) {
    stop("argument 'index' is not provided", call. = FALSE)
  }
  
  url <- URLencode(paste0(host, "/", index, 
                          "/_data?format=csv&size=", size,
                          "&source_content_type=application/json",
                          "&source=", post_body))
  
  df <- data.table::fread(url, quote = "\"", 
                          colClasses = "character", data.table = FALSE, 
                          showProgress = FALSE, ...)
  df
  
}

#########################
### GET Data
#########################
if (is_local()) {
  pcode <- "C752"
  purl <- "https://testcrm.rtworkspace.com"
} else {
  pcode <- '$$FilterValue::get_my_projectcode()$$'
  purl <- '$$FilterValue::get_project_info(FilterValue::get_my_projectcode())$$'
}


std_link_via_qrcode <- download_es_data("std_link_via_qrcode",
                                        paste0("{\"size\":100000,\"collapse\":{\"field\":\"keyid.raw\"},",
                                               "\"sort\":[{\"added_date\":{\"order\":\"desc\"}}]}")) 

#########################
### Process Data
#########################
data <- sqldf(
  "
  SELECT 
    keyid, 
    object_id, 
    object_lb, 
    categ1_lb, 
    categ2_lb, 
    categ3_lb, 
    categ4_lb, 
    categ5_lb, 
    std_image1_path, 
    std_image2_path, 
    std_image3_path, 
    std_image4_path, 
    std_image5_path, 
    std_video_path, 
    is_private, 
    is_manuallyapproved, 
    object_qrcode, 
    true_qrcode, 
    std_scode_id, 
    std_card_id, 
    std_username, 
    std_status_id, 
    std_status_lb, 
    class_creator_id, 
    std_fullname, 
    std_gender_lb, 
    std_dob_fm, 
    std_profile_media1_path, 
    std_profile_media2_path, 
    std_profile_media3_path, 
    std_profile_media4_path, 
    std_profile_media5_path, 
    std_profile_media6_path, 
    address_final, 
    std_phone1, 
    std_phone2, 
    std_email, 
    std_email2, 
    std_code, 
    uni_lb, 
    object_apply_id, 
    object_apply_lb,
    project_code,
    std_project_code
  FROM 
    `std_link_via_qrcode` `main` 
  GROUP BY 
    class_creator_id, 
    std_card_id
  ")


# data_json <- jsonlite::toJSON(result, dataframe = 'values')
data_json <- toJSON(data)
write(data_json, 'result.json')

Let break it down:

1. Load required library:

RTA::global_setup()
RTA::load_packages(RTA, dplyr, stringr, jsonlite, sqldf, lubridate)
options(encoding = "UTF-8")
Sys.setlocale(category = "LC_ALL", locale = "en_US.UTF-8")

Pay attention to the load_packages section, here we will list the packages needed for the script. We can use the following command in console to check what this package supports:

help(package = "package_name")

List of commonly used packages:

  • RTA
  • dplyr, tidyr: used for wrangling data
  • jsonlite: used for handling json
  • lubridate: used for handling datetime
  • stringr: used for manipulating string

Other packages

  • RTAGGS, googlesheets4, googledrive: for working with GGS, Drive (Read, Write data,…)
  • httr: for working with HTTP organised by HTTP verbs (GET(), POST(), etc).

Normally, to process data in R, we often use 2 main packages, dplyr and tidyr (and some others). However, in this article, we will use sqldf as main package, because it is convenient for people with previous SQL knowledge to be familiar with R.

2. Create some function:

#########################
### GET ES functions
#########################
download_es_data <- function(index, post_body = "{}", host = "https://es.rta.vn", size = 10000, ...) {
  if (missing(index)) {
    stop("argument 'index' is not provided", call. = FALSE)
  }
  
  url <- URLencode(paste0(host, "/", index, 
                          "/_data?format=csv&size=", size,
                          "&source_content_type=application/json",
                          "&source=", post_body))
  
  df <- data.table::fread(url, quote = "\"", 
                          colClasses = "character", data.table = FALSE, 
                          showProgress = FALSE, ...)
  df
  
}

This code is to create a function to get data from Elasticsearch (Similar to plugin /_data).
Later Ms. Dieu Linh will include this function in the RTA package.

Usage:

download_es_data(
  index = "index_name",
  post_body = "{}",  #optional, `{}` as default
  host = "https://es.rta.vn", #optional, `https://es.rta.vn` as default
  size = 10000 #optional, `10000` as default
)

3. Get some needed data:

#########################
### GET Data
#########################
if (is_local()) {
  pcode <- "C752"
  purl <- "https://testcrm.rtworkspace.com"
} else {
  pcode <- '$$FilterValue::get_my_projectcode()$$'
  purl <- '$$FilterValue::get_project_info(FilterValue::get_my_projectcode())$$'
}

is_local(): this is to check whether the code run locally or on server

Some examples:

  • Get data from payload (datasource):
if (is_local()) {

    project_code <- 'C785'
    project_url <- 'https://vndemo.rtworkspace.com/'
    spreadsheet_link <- '1w2Po19dtxMj9zXUA3npCb7Fao37rssZOY2JyxQWjEEE'
    org_id <- 'rta'
    submission_date <- now()
    username <- 'rta_xuanle'
    fullname <- "Xuan Dep Chai"
    
} else {
    project_code <- '$$FilterValue::get_my_projectcode()$$'
    project_url <- '$$FilterValue::get_project_info(FilterValue::get_my_projectcode())$$'
    spreadsheet_link <- '@@$params["data"][0]["ss_id"]@@'
    org_id <- '@@$params["data"][0]["org_id"]@@'
    submission_date <- '@@$params["data"][0]["submission_date"]@@'
    username <- '@@$params["data"][0]["username"]@@'
    fullname <- '@@$params["data"][0]["fullname"]@@'
}
  • Get data from payload (Collector):
if (is_local()) {
  project_code <- 'C785'
  submissiondate <- now()
  username <- 'rta_xuanle'
  payload <- jsonlite::fromJSON("/d/payload.json")
  working_data <- jsonlite::fromJSON("/d/data.json")
} else {
  project_code <- '$$FilterValue::get_my_projectcode()$$'
  submissiondate <- '@@$params["data"][0]["submission_date"]@@'
  username <- '@@$params["data"][0]["username"]@@'
  payload <- jsonlite::fromJSON("payload.json")
  working_data <- jsonlite::fromJSON("data.json")
}

4. Get data from ES:

We use download_es_data to fetch data:

std_link_via_qrcode <- download_es_data("std_link_via_qrcode",
                                        paste0("{\"collapse\":{\"field\":\"keyid.raw\"},",
                                               "\"sort\":[{\"added_date\":{\"order\":\"desc\"}}]}")) 

The missing params will be filled with default. This this example, it will use the default host “https://es.rta.vn” and size 1000.

Some more examples:

  • Get list of records from es ss_form_datasetting on C752
project_code <- "C752"

data <- download_es_data(
  "ss_form_datasetting",
  paste0("{\"query\":{\"bool\":{\"must\":[{\"terms\":{\"project_code.keyword\":[\"",
  project_code,
  "\"]}}]}}}")
)

*paste/paste0: like concat in SQL

5. Process data:

It may be difficult for beginner to process data by using dplyr, tidyr. With sqldf, it is easier.
Note: it uses SQLite syntax

#########################
### Process Data
#########################
data <- sqldf(
  "
  SELECT 
    keyid, 
    object_id, 
    object_lb, 
    categ1_lb, 
    categ2_lb, 
    categ3_lb, 
    categ4_lb, 
    categ5_lb, 
    std_image1_path, 
    std_image2_path, 
    std_image3_path, 
    std_image4_path, 
    std_image5_path, 
    std_video_path, 
    is_private, 
    is_manuallyapproved, 
    object_qrcode, 
    true_qrcode, 
    std_scode_id, 
    std_card_id, 
    std_username, 
    std_status_id, 
    std_status_lb, 
    class_creator_id, 
    std_fullname, 
    std_gender_lb, 
    std_dob_fm, 
    std_profile_media1_path, 
    std_profile_media2_path, 
    std_profile_media3_path, 
    std_profile_media4_path, 
    std_profile_media5_path, 
    std_profile_media6_path, 
    address_final, 
    std_phone1, 
    std_phone2, 
    std_email, 
    std_email2, 
    std_code, 
    uni_lb, 
    object_apply_id, 
    object_apply_lb,
    project_code,
    std_project_code
  FROM 
    `std_link_via_qrcode` `main` 
  GROUP BY 
    class_creator_id, 
    std_card_id
  ")

6. Save data:

It is a little bit difference between Datasource and Collector

  • Datasource
#################################
#### Export output
#################################
output <- data

version <- 2

if (version == 1) {
  output <- rbind(colnames(output), output)
  jsonlite::toJSON(output, dataframe = 'values')
  #write.csv(result, 'result.csv', row.names = FALSE)
  
} else if (version == 2) {
  output <- rbind(colnames(output), output)
  data_json <- jsonlite::toJSON(output, dataframe = 'values')
  write(data_json, 'result.json')
  
} else if (version == 3) {
  write.csv(output, 'result.csv', row.names = FALSE)
  # writexl::write_xlsx(output, 'D:/result.xlsx')
}
  • Collector
data_json <- toJSON(data)
write(data_json, 'result.json')

7. Final Template:

#########################
### Load library
#########################
RTA::global_setup()
RTA::load_packages(RTA, dplyr, stringr, jsonlite, sqldf, lubridate)
options(encoding = "UTF-8")
Sys.setlocale(category = "LC_ALL", locale = "en_US.UTF-8")

#########################
### GET ES functions
#########################
download_es_data <- function(index, post_body = "{}", host = "https://es.rta.vn", size = 10000, ...) {
  if (missing(index)) {
    stop("argument 'index' is not provided", call. = FALSE)
  }
  
  url <- URLencode(paste0(host, "/", index, 
                          "/_data?format=csv&size=", size,
                          "&source_content_type=application/json",
                          "&source=", post_body))
  
  df <- data.table::fread(url, quote = "\"", 
                          colClasses = "character", data.table = FALSE, 
                          showProgress = FALSE, ...)
  df
  
}

#########################
### GET Data
#########################
if (is_local()) {
  pcode <- "C752"
  purl <- "https://testcrm.rtworkspace.com"
  # Some extra params that you want to add use
} else {
  pcode <- '$$FilterValue::get_my_projectcode()$$'
  purl <- '$$FilterValue::get_project_info(FilterValue::get_my_projectcode())$$'
  # Some extra params that you want to add use
}


dm1 <- download_es_data(index = "dm1",
                        post_body = "query1",
                        size = 1000)

dm2 <- download_es_data(index = "dm2",
                        post_body = "query2",
                        size = 10000)

dm3 <- download_es_data(index = "dm3",
                        post_body = "query3",
                        size = 500)

#########################
### Process Data
#########################
data <- sqldf(
  "
  SELECT
  x,
  y,
  z
  FROM
  a LEFT JOIN b by a.x = b.y
  LEFT JOIN c by b.y = c.z
  WHERE x IS NOT NULL
  GROUP BY z
  ")


# data_json <- jsonlite::toJSON(result, dataframe = 'values')
data_json <- toJSON(data)
write(data_json, 'result.json')
``