Opasnet Base Connection for R
Moderator:Teemu R (see all) |
|
Upload data
|
Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code to R; this defines the functions, after which they can be called for in R.
Package dependencies
These packages are required for most of the code to work. Load these first (copy-pasteable).
library(utils) library(RODBC)
Setup
Establishes the connection to the database. Copy paste first.
db <- odbcConnect("opasnet_base")
- Assuming "opasnet_base" is a correctly defined DSN.
- Note that uploading requires writing permissions.
Downloading data
Functions
- Now simplified, a lot faster. Copy paste second.
op_baseGetData <- function(ident) { obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1] series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, ' ORDER BY series_id DESC LIMIT 1'))[1,1] Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.name AS ind, loc.location AS loc, res.result', ' FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', ' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id', ' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, ' AND actobj.series_id = ', series_id, ' ORDER BY cell.id, res.obs, obj.name', sep = '')) nind <- length(levels(Data[,3])) ncell <- nrow(Data)/nind dataframe <- data.frame(NA) dataframe <- Data[1:ncell*nind, c(1,2)] for (i in 1:nind) { dataframe[,2 + i] <- Data[1:ncell*nind - i + 1, 4] colnames(dataframe)[2 + i] <- as.character(Data[nind - i + 1, 3]) } dataframe[,3 + nind] <- Data[1:ncell*nind, 5] colnames(dataframe)[3 + nind] <- "Result" rownames(dataframe) <- 1:ncell dataframe }
Usage
variable <- op_baseGetData("page identifier")
- Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result.
Manipulating data
Functions
DataframeToArray <- function(dataframe) { DimNames <- rep(vector("list", 1), ncol(dataframe) - 2) names(DimNames) <- c(colnames(dataframe)[2:(ncol(dataframe) - 1)]) nind <- length(names(DimNames)) indlengths <- 0 for (i in 1:nind) { DimNames[[i]] <- levels(factor(dataframe[,1 + i])) indlengths[i] <- length(DimNames[[i]]) } array <- rep(NA, prod(indlengths)) dim(array) <- indlengths dimnames(array) <- DimNames array[as.matrix(dataframe[,c(1:nind + 1)])] <- dataframe[,ncol(dataframe)] array }
Usage
variable2 <- DataframeToArray(variable1)
- variable 1 must be in the same format as the result when downloading.
- first column (id) is ignored
- variable1 <- function(variable1) is allowed
Other useful stuff
as.data.frame(as.table(array)) dataframe[is.na(dataframe[,ncol(dataframe)])==FALSE,]
- First line returns an array from a data frame.
- Second line returns rows without empty values.
dataframe[grep("location", dataframe$index, ignore.case = TRUE),]
- Returns all rows of a data frame where column "index" value is "location".
Uploading data
Slight problems with probabilistic data, sorting orders do not match between read and write. Very efficient. Probabilistic data supported. Appending to existing data not supported, a whole new set will always be produced. Varying numbers of iterations between cells not supported.
Functions
op_baseWrite <- function(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL) { #Coerce input into a data frame if it isn't one already; get rid of empty cells if (is.array(input)) dataframe <- as.data.frame(as.table(input)) else dataframe <- input rescol <- colnames(dataframe) == "Freq" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Freq" else { rescol <- colnames(dataframe) == "Result" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Result" else { rescol <- colnames(dataframe) == "result" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "result" } } values <- dataframe[,rescol] cond <- (is.na(values) == FALSE) dataframe <- dataframe[cond,] #Add page to database (if it doesn't already exist) if (is.null(ident)==TRUE) if (interactive()) ident <- readline(paste("What is the identifier of this object?", "\n", sep = "")) else stop("indentifier of object no given") wiki_name <- substring(ident, 1, 5) if (wiki_name=="Op_en") {wiki_id <- 1; page <- substring(ident, 6, nchar(ident))} else if (wiki_name=="Op_fi") { wiki_id <- 2; page <- substring(ident, 6, nchar(ident))} else {wiki_id <- 0; page <- 0} obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1] if (is.na(obj_id)==TRUE) { if (is.null(name)==TRUE) if (interactive()) name <- readline(paste("What is the name of this object?", "\n", sep = "")) else stop("object name not given") if (is.null(unit)==TRUE) if (interactive()) unit <- readline(paste("What is the unit of this object?", "\n", sep = "")) else stop("unit not given") if (is.null(objtype_id)==TRUE) if (interactive()) objtype_id <- readline(paste("What type of object is", " this (id)?", paste(paste(sqlQuery(db, "SELECT id FROM objtype")[,1], sqlQuery(db, "SELECT objtype", " FROM objtype")[,1], sep = " - "), collapse = ", "), "\n", collapse = " ")) else { stop("object type not given")} sqlQuery(db, paste('INSERT INTO obj (ident, name, unit, objtype_id, page, wiki_id) VALUES ("', paste(ident, name, unit, sep = '","'), '",', paste(objtype_id, page, wiki_id, sep = ','), ')', sep = '')) obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1] } #Write act and actobj if (is.null(who)==TRUE) if (interactive()) {who <- readline(paste("What is the name of the uploader?", "\n", sep = "")) } else stop("uploader name not given") sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (4, "', who, '","R upload")', sep = '')) act_id <- sqlQuery(db, "SELECT id FROM act ORDER BY id DESC LIMIT 1")[1,1] sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id) VALUES (', paste(act_id, obj_id, act_id, sep = ','), ')', sep = '')) actobj_id <- sqlQuery(db, paste('SELECT id FROM actobj WHERE act_id = ', act_id, sep = ''))[1,1] #Write indexes ColNames <- colnames(dataframe[(colnames(dataframe) == "id") == FALSE]) ColNames <- ColNames[(ColNames == "obs") == FALSE] ColNames <- ColNames[(ColNames == "Result") == FALSE] ColNames <- ColNames[(ColNames == "result") == FALSE] ColNames <- ColNames[(ColNames == "Freq") == FALSE] nind <- length(ColNames) DimNames <- rep(vector("list", 1), nind) names(DimNames) <- ColNames indlengths <- 0 for (i in 1:nind) { DimNames[[i]] <- levels(factor(dataframe[, ColNames[i]])) indlengths[i] <- length(DimNames[[i]]) } indQuery <- NA indQuery[1:nind*5 - 4] <- gsub(" ", "_", ColNames) indQuery[1:nind*5 - 3] <- '","' indQuery[1:nind*5 - 2] <- ColNames indQuery[1:nind*5 - 1] <- '",6' indQuery[1:(nind - 1)*5] <- '),("' sqlQuery(db, paste('INSERT IGNORE INTO obj (ident, name, objtype_id) VALUES ("', paste(indQuery, collapse = ), ')', sep = '')) IndIds <- sqlQuery(db, paste('SELECT id, ident FROM obj WHERE ident IN("', paste(gsub(" ", "_", ColNames), collapse = '","'), '")', sep = '')) DimIds <- DimNames DimN <- 1:nind names(DimN) <- gsub(" ", "_", ColNames) for (i in 1:nrow(IndIds)) { names(DimIds)[DimN[IndIds[i,2]]] <- IndIds[grep(gsub(" ", "_", ColNames[i]), IndIds$ident), 1] } #Write locations y <- 1 LocNames <- as.data.frame(matrix(rep(NA, 2*sum(indlengths)), sum(indlengths), 2)) for (i in 1:nind) { LocNames[y:(y + indlengths[i] - 1),1:2] <- matrix(c(rep(ColNames[i], indlengths[i]), levels(factor(dataframe[,ColNames[i]]))), indlengths[i], 2) y <- y + indlengths[i] } LocNames[,1] <- names(DimIds)[DimN[LocNames[,1]]] nloc <- nrow(LocNames) locQuery <- NA locQuery[1:nloc*4 - 3] <- LocNames[,1] locQuery[1:nloc*4 - 2] <- ',"' locQuery[1:nloc*4 - 1] <- LocNames[,2] locQuery[1:(nloc - 1)*4] <- '"),(' sqlQuery(db, paste('INSERT IGNORE INTO loc (obj_id_i, location) VALUES (', paste(locQuery, collapse = ), '")', sep = '')) LocIds <- sqlQuery(db, paste('SELECT id, obj_id_i, location FROM loc WHERE location IN("', paste(LocNames[,2], collapse = '","'), '") AND obj_id_i IN(', paste(names(DimIds), collapse = ','), ')', sep = '')) LocMap <- NA y <- 1 for (i in 1:nind) { LocMap <- LocIds[grep(names(DimIds)[i], LocIds$obj_id_i), 1] names(LocMap) <- LocIds[grep(names(DimIds)[i], LocIds$obj_id_i), 3] DimIds[[i]] <- LocMap[DimNames[[i]]] } #Changing location names in table into ids for (i in 1:nind) { dataframe[,ColNames[i]] <- factor(dataframe[,ColNames[i]]) levels(dataframe[,ColNames[i]]) <- DimIds[[i]] } #A hidden parameter for adjusting query packet sizes, the higher the faster, though crash becomes likelier maxrows <- 50000 #Writing cell obscol <- colnames(dataframe) == "obs" if (length(obscol[obscol==TRUE]) >= 1) { obscol <- TRUE n <- length(levels(factor(dataframe[,"obs"])))} else { obscol <- FALSE n <- 1 } ncell <- nrow(dataframe)/n cellQuery <- NA cellQuery[1:ncell*6-5] <- actobj_id cellQuery[1:ncell*6-4] <- "," if (n == 1) { cellQuery[1:ncell*6-3] <- dataframe[,rescol]} else { cellQuery[1:ncell*6-3] <- apply(matrix(dataframe[,rescol], n, ncell), 2, mean) } cellQuery[1:ncell*6-2] <- "," cellQuery[1:ncell*6-1] <- n cellQuery[1:(ncell-1)*6] <- "),(" if (ncell>=maxrows) { for (i in 1:(ncell%/%maxrows)) { sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[ ((i-1)*6*maxrows+1):(i*6*maxrows-1)], collapse = ''), ')', sep = '')) } } sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[(ncell%/% maxrows*6*maxrows+1):length(cellQuery)], collapse = ''), ')', sep = '')) #Writing res cell_id <- sqlQuery(db, paste("SELECT id FROM cell WHERE actobj_id = ", actobj_id, " ORDER BY ID", sep = ""))[,1] if (ncell != length(cell_id)) stop("Number of cells written does not match with expected value") resQuery <- NA resQuery[1:nrow(dataframe)*6-5] <- rep(cell_id, n) resQuery[1:nrow(dataframe)*6-4] <- "," if (n==1) resQuery[1:nrow(dataframe)*6-3] <- "0" else resQuery[1:nrow(dataframe)*6-3] <- dataframe[,"obs"] resQuery[1:nrow(dataframe)*6-2] <- "," resQuery[1:nrow(dataframe)*6-1] <- dataframe[,rescol] resQuery[1:(nrow(dataframe)-1)*6] <- "),(" if (nrow(dataframe)>=maxrows) { for (i in 1:(nrow(dataframe)%/%maxrows)) { sqlQuery(db, paste('INSERT INTO res (cell_id, obs, result) VALUES (', paste(resQuery[((i-1) *4*maxrows+1):(i*4*maxrows-1)], collapse = ''), ')', sep = '')) } } sqlQuery(db, paste('INSERT INTO res (cell_id, obs, result) VALUES (', paste(resQuery[(nrow(dataframe)%/%maxrows *4*maxrows+1):length(resQuery)], collapse = ''), ')', sep = '')) #Writing loccell locidmatrix <- rep(NA, nind*ncell) dim(locidmatrix) <- c(nind,ncell) for (i in 1:nind) { locidmatrix[i,] <- as.character(dataframe[1:ncell*n - n + 1,ColNames[i]]) } loccellQuery <- rep(NA, 4*nind*ncell) dim(loccellQuery) <- c(4*nind, ncell) loccellQuery[1:nind*4-3, 1:ncell] <- rep(cell_id, each = nind) loccellQuery[1:nind*4-2, 1:ncell] <- "," loccellQuery[1:nind*4-1, 1:ncell] <- locidmatrix loccellQuery[1:nind*4, 1:ncell] <- "),(" loccellQuery[nind*4, ncell] <- "" if (ncell>=maxrows) { for (i in 1:(ncell%/%maxrows)) { loccellQuery[nind*4, i*maxrows] <- "" sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[, ((i-1)*maxrows+1):(i*maxrows)], collapse = ''), ')', sep = '')) } } sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,(ncell%/% maxrows*maxrows+1):ncell], collapse = ''), ')', sep = '')) }
Usage
op_baseWrite(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL)
- Db and input must be defined, the rest if not defined is prompted for by the function.
- Input may be given in either array or data.frame form.
- Input in data.frame form must contain a "Freq", "Result" or "result" column, this is where the numerical value is read.
- Probabilistic input must contain a dimension (in case of array) or a column (in case of data.frame) named "obs". Also: all cells must have values for each "obs" value.