Opasnet Base Connection for R: Difference between revisions
Jump to navigation
Jump to search
mNo edit summary |
m (→Usage) |
||
Line 56: | Line 56: | ||
variable <- op_baseGetData("page identifier") | variable <- op_baseGetData("page identifier") | ||
*Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, | *Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result. | ||
==Manipulating data== | ==Manipulating data== |
Revision as of 11:48, 4 October 2010
Moderator:Teemu R (see all) |
|
Upload data
|
Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code to R; this defines the functions, after which they can be called for in R.
Package dependencies
These packages are required for most of the code to work. Load these first (copy-pasteable).
library(utils) library(RODBC)
Setup
Establishes the connection to the database. Copy paste first.
db <- odbcConnect("opasnet_base")
- Assuming "opasnet_base" is a correctly defined DSN.
- Note that uploading requires writing permissions.
Downloading data
Functions
- Now simplified, a lot faster. Copy paste second.
op_baseGetData <- function(ident) { obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=))[1,1] series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, ' ORDER BY series_id DESC LIMIT 1'))[1,1] Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.name AS ind, loc.location AS loc, res.result', ' FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', ' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id', ' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, ' AND actobj.series_id = ', series_id, ' ORDER BY cell.id, res.obs, obj.name', sep = )) nind <- length(levels(Data[,3])) ncell <- nrow(Data)/nind dataframe <- data.frame(NA) dataframe <- Data[1:ncell*nind, c(1,2)] for (i in 1:nind) { dataframe[,2 + i] <- Data[1:ncell*nind - i + 1, 4] colnames(dataframe)[2 + i] <- as.character(Data[nind - i + 1, 3]) } dataframe[,3 + nind] <- Data[1:ncell*nind, 5] colnames(dataframe)[3 + nind] <- "Result" rownames(dataframe) <- 1:ncell dataframe }
Usage
variable <- op_baseGetData("page identifier")
- Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result.
Manipulating data
Functions
DataframeToArray <- function(dataframe) { DimNames <- rep(vector("list", 1), ncol(dataframe) - 2) names(DimNames) <- c(colnames(dataframe)[2:(ncol(dataframe) - 1)]) nind <- length(names(DimNames)) indlengths <- 0 for (i in 1:nind) { DimNames[[i]] <- levels(factor(dataframe[,1 + i])) indlengths[i] <- length(DimNames[[i]]) } array <- rep(NA, prod(indlengths)) dim(array) <- indlengths dimnames(array) <- DimNames array[as.matrix(dataframe[,c(1:nind + 1)])] <- dataframe[,ncol(dataframe)] array }
Usage
variable2 <- DataframeToArray(variable1)
- variable 1 must be in the same format as the result when downloading.
- first column (id) is ignored
- variable1 <- function(variable1) is allowed
Other useful stuff
as.data.frame(as.table(array)) dataframe[is.na(dataframe[,ncol(dataframe)])==FALSE,]
- First line returns an array from a data frame.
- Second line returns rows without empty values.
dataframe[grep("location", dataframe$index, ignore.case = TRUE),]
- Returns all rows of a data frame where column "index" value is "location".
Uploading data
Tested and works. Cell writing is efficient with even with huge data, index writing requires more work. Probabilistic data not supported.
Low level
library(utils) library(RODBC) dbSimpleRead <- function(db, table, column, filtercol, filter) { sqlQuery(db, paste('SELECT ', column, ' FROM ', table, ' WHERE ', filtercol, ' = "', filter, '"', sep = ))[1,1] } dbLessSimpleRead <- function(db, table, column, filtercol1, filter1, filtercol2, filter2) { sqlQuery(db, paste('SELECT ', column, ' FROM ', table, ' WHERE ', filtercol1, ' = "', filter1, '" AND ', filtercol2, ' = "', filter2, '"', sep = ))[1,1] } SimplePrompt <- function(question) { readline(paste(question, "\n", sep = "")) }
High level
op_baseWriteArray <- function(db, array, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL) { originaldataframe <- as.data.frame(as.table(array)) values <- originaldataframe[,"Freq"] cond <- (is.na(values) == FALSE) dataframe <- originaldataframe[cond,] #Add page to database (if it doesn't already exist) if (is.null(ident)==TRUE) if (interactive()) ident <- SimplePrompt("What is the identifier of this object?") wiki_name <- substring(ident, 1, 5) if (wiki_name=="Op_en") {wiki_id <- 1; page <- substring(ident, 6, nchar(ident))} else if (wiki_name=="Op_fi") {wiki_id <- 2; page <- substring(ident, 6, nchar(ident))} else {wiki_id <- 0; page <- 0} obj_id <- dbSimpleRead(db, "obj", "id", "ident", ident) if (is.na(obj_id)==TRUE) { if (is.null(name)==TRUE) if (interactive()) name <- SimplePrompt("What is the name of this object?") else error if (is.null(unit)==TRUE) if (interactive()) unit <- SimplePrompt("What is the unit of this object?") else error if (is.null(objtype_id)==TRUE) if (interactive()) objtype_id <- SimplePrompt(paste("What type of object is this (id)?", paste(paste(sqlQuery(db, "SELECT id FROM objtype")[,1], sqlQuery(db, "SELECT objtype FROM objtype")[,1], sep = " - "), collapse = ", "), collapse = " ")) else error sqlQuery(db, paste('INSERT INTO obj (ident, name, unit, objtype_id, page, wiki_id) VALUES ("', paste(ident, name, unit, sep = '","'), '",', paste(objtype_id, page, wiki_id, sep = ','), ')', sep = )) obj_id <- dbSimpleRead(db, "obj", "id", "ident", ident) } #Write act and actobj if (is.null(who)==TRUE) if (interactive()) who <- SimplePrompt("What is the name of the uploader?") else error sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (4, "', who, '","R upload")', sep = )) act_id <- sqlQuery(db, "SELECT id FROM act ORDER BY id DESC LIMIT 1")[1,1] sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id) VALUES (', paste(act_id, obj_id, act_id, sep = ','), ')', sep = )) actobj_id <- dbSimpleRead(db, "actobj", "id", "act_id", act_id) #Write indexes and locations x <- 1 DimIds <- list() IndId <- 0 Ind <- 0 loc <- 0 LocId <- 0 for (i in 1:(ncol(dataframe) - x)) { DimIds[[i]] <- rep(NA, length(levels(factor(dataframe[,i])))) IndId <- dbSimpleRead(db, "obj", "id", "ident", dimnames(dataframe)[[2]][i]) if (is.na(IndId)==TRUE) { sqlQuery(db, paste('INSERT INTO obj (ident, name, objtype_id) VALUES ("', paste(dimnames(dataframe)[[2]][i], dimnames(dataframe)[[2]][i], sep = '","'), '",', 6, ')', sep = '')) IndId <- dbSimpleRead(db, "obj", "id", "ident", dimnames(dataframe)[[2]][i]) } for (j in 1:length(levels(factor(dataframe[,i])))) { #1:length(dimnames(array)[[i]])) { LocId <- dbLessSimpleRead(db, "loc", "id", "location", levels(factor(dataframe[,i]))[j], "obj_id_i", IndId) if (is.na(LocId)==TRUE) { sqlQuery(db, paste('INSERT INTO loc (obj_id_i, location) VALUES (', IndId, ',"', levels(factor(dataframe[,i]))[j], '")', sep = '')) LocId <- dbLessSimpleRead(db, "loc", "id", "location", levels(factor(dataframe[,i]))[j], "obj_id_i", IndId) } DimIds[[i]][j] <- LocId } names(DimIds)[i] <- IndId } # Changing location names in table into ids for (i in 1:(ncol(dataframe)-1)) { dataframe[,i] <- factor(dataframe[,i]) levels(dataframe[,i]) <- DimIds[[i]] } maxrows <- 5000 #Writing cell n <- 1 cellQuery <- NA cellQuery[1:nrow(dataframe)*6-5] <- actobj_id cellQuery[1:nrow(dataframe)*6-4] <- "," cellQuery[1:nrow(dataframe)*6-3] <- dataframe[,"Freq"] cellQuery[1:nrow(dataframe)*6-2] <- "," cellQuery[1:nrow(dataframe)*6-1] <- n cellQuery[1:(nrow(dataframe)-1)*6] <- "),(" if (nrow(dataframe)>=maxrows) { for (i in 1:(nrow(dataframe)%/%maxrows)) { sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[((i-1)*6*maxrows+1):(i*6*maxrows-1)], collapse = ), ')', sep = )) } } sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[(nrow(dataframe)%/%maxrows*6*maxrows+1):length(cellQuery)], collapse = ), ')', sep = )) #Writing res cell_id <- rev(sqlQuery(db, paste("SELECT id FROM cell WHERE actobj_id = ", actobj_id, " ORDER BY ID DESC LIMIT ", nrow(dataframe), sep = ""))[,1]) resQuery <- NA resQuery[1:length(cell_id)*4-3] <- cell_id resQuery[1:length(cell_id)*4-2] <- "," resQuery[1:length(cell_id)*4-1] <- dataframe[,"Freq"] resQuery[1:(length(cell_id)-1)*4] <- "),(" if (length(cell_id)>=maxrows) { for (i in 1:(length(cell_id)%/%maxrows)) { sqlQuery(db, paste('INSERT INTO res (cell_id, result) VALUES (', paste(resQuery[((i-1)*4*maxrows+1):(i*4*maxrows-1)], collapse = ), ')', sep = )) } } sqlQuery(db, paste('INSERT INTO res (cell_id, result) VALUES (', paste(resQuery[(length(cell_id)%/%maxrows*4*maxrows+1):length(resQuery)], collapse = ), ')', sep = )) #Writing loccell locidmatrix <- rep(NA, (ncol(dataframe)-1)*length(cell_id)) dim(locidmatrix) <- c(ncol(dataframe)-1,length(cell_id)) for (i in 1:(ncol(dataframe)-1)) { locidmatrix[i,] <- as.character(dataframe[,i]) } loccellQuery <- rep(NA, 4*(ncol(dataframe)-1)*length(cell_id)) dim(loccellQuery) <- c(4*(ncol(dataframe)-1), length(cell_id)) loccellQuery[1:(ncol(dataframe)-1)*4-3, 1:length(cell_id)] <- rep(cell_id, each = ncol(dataframe) - 1) loccellQuery[1:(ncol(dataframe)-1)*4-2, 1:length(cell_id)] <- "," loccellQuery[1:(ncol(dataframe)-1)*4-1, 1:length(cell_id)] <- locidmatrix loccellQuery[1:(ncol(dataframe)-1)*4, 1:length(cell_id)] <- "),(" loccellQuery[(ncol(dataframe)-1)*4, length(cell_id)] <- "" if (length(cell_id)>=maxrows) { for (i in 1:(length(cell_id)%/%maxrows)) { loccellQuery[(ncol(dataframe)-1)*4, i*maxrows] <- "" sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,((i-1)*maxrows+1):(i*maxrows)], collapse = ), ')', sep = )) } } sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,(length(cell_id)%/%maxrows*maxrows+1):length(cell_id)], collapse = ), ')', sep = )) }
Usage
op_baseWriteArray(db, array, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL)
Db and array must be defined, the rest if not defined is prompted for by the function.