Opasnet Base Connection for R: Difference between revisions
Jump to navigation
Jump to search
m (→Uploading data) |
|||
Line 17: | Line 17: | ||
<nowiki> | <nowiki> | ||
op_baseGetData <- function(dsn, ident, include = NULL, exclude = NULL, series_id = NULL, iterations = NULL, use.utf8 = TRUE) { | op_baseGetData <- function(dsn, ident, include = NULL, exclude = NULL, series_id = NULL, iterations = NULL, use.utf8 = TRUE, utf8.conv = TRUE) { | ||
if (use.utf8) db <- odbcConnect(dsn, DBMSencoding = "UTF-8") else db <- odbcConnect(dsn) | if (use.utf8) db <- odbcConnect(dsn, DBMSencoding = "UTF-8") else db <- odbcConnect(dsn) | ||
obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1] | obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1] |
Revision as of 11:44, 27 June 2011
Moderator:Teemu R (see all) |
|
Upload data
|
Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code you need to the R console; this defines the functions, after which they can be called for in that R session. Or alternatively install the OpasnetBaseUtils package.
Package dependencies
These packages are required for most of the code to work. To install: from the top bar menu Packages>Install. To load: copy-paste.
library(RODBC)
Downloading data
op_baseGetData <- function(dsn, ident, include = NULL, exclude = NULL, series_id = NULL, iterations = NULL, use.utf8 = TRUE, utf8.conv = TRUE) { if (use.utf8) db <- odbcConnect(dsn, DBMSencoding = "UTF-8") else db <- odbcConnect(dsn) obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1] if (length(series_id) == 0) {series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, ' ORDER BY series_id DESC LIMIT 1', sep = ''))[1,1]} sliced <- FALSE locations <- NULL x <- 1 basequery <- paste('SELECT loccell.cell_id FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN', ' loccell ON cell.id = loccell.cell_id WHERE actobj.obj_id = ', obj_id, ' AND actobj.series_id = ', series_id, ' AND loccell.loc_id IN(', sep = '') if (length(include) != 0) { sliced <- TRUE locations[x] <- paste("IN(", basequery, paste(include, collapse = ","), ")", sep = "") x <- x + 1 } if (length(exclude) != 0) { sliced <- TRUE locations[x] <- paste("NOT IN(", basequery, paste(exclude, collapse = ","), ")", sep = "") } if (sliced == FALSE) { Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.ident AS ind, loc.location AS loc, res.result,', ' res.restext FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', ' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id', ' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, ' AND actobj.series_id = ', series_id, if(length(iterations)==1){paste(" AND obs <= ", iterations, sep = "")}, sep = '')) } else { Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.ident AS ind, loc.location AS loc, res.result,', ' res.restext FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', ' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id', ' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, ' AND actobj.series_id = ', series_id, if(length(iterations)==1){paste(" AND obs <= ", iterations, sep = "")}, ' AND (cell.id ', paste(locations, collapse = ') AND cell.id '), '))', sep = '')) } odbcClose(db) Data <- Data[order(Data[,1], Data[,2], Data[,3]),] nind <- length(levels(Data[,3])) nres <- nrow(Data)/nind dataframe <- Data[1:nres*nind, c(1,2)] for (i in 1:nind) { dataframe[,2 + i] <- factor(Data[1:nres*nind - (nind - i), 4]) levels(dataframe[,2 + i]) <- gsub(" *$", "",gsub("^ *", "", levels(dataframe[,2 + i]))) colnames(dataframe)[2 + i] <- as.character(Data[i, 3]) } dataframe[,1:2 + 2 + nind] <- Data[1:nres*nind, 5:6] colnames(dataframe)[1:2 + 2 + nind] <- c("Result", "Result.Text") rownames(dataframe) <- 1:nres return(dataframe) }
Usage
variable <- op_baseGetData("opasnet_base", "page identifier", include = vector_of_loc_ids, exclude = vector_of_loc_ids)
- Assuming "opasnet_base" is a correctly defined DSN (Data Service Name; in Windows XP: Control Panel\Administrative tools\Data Sources (ODBC)).
- Include and exclude are optional.
- Include picks all cells in the locations given.
- The clearest case is when all the included locations belong to the same index: Any cells in the non-included locations of the index will be left out.
- In case given locations are in multiple indices: The effect produced will be the same as picking separately for each index and removing duplicates.
- Exclude unpicks any cells which are indexed by the locations given. Slower than include.
- They can be used in unison.
- Include picks all cells in the locations given.
- Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result.
- series_id is an optional parameter, if it is not given the most current upload of the data will be downloaded.
Finding index data
op_baseGetLocs <- function(dsn, ident, series_id = NULL, use.utf8 = TRUE) { if (use.utf8) db <- odbcConnect(dsn, DBMSencoding = "UTF-8") else db <- odbcConnect(dsn) obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1] if (length(series_id) == 0) {series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, ' ORDER BY series_id DESC LIMIT 1', sep = ''))[1,1]} Locs <- sqlQuery(db, paste("SELECT DISTINCT obj.ident AS ind, loc.location AS loc, loc.id AS loc_id", " FROM actobj LEFT JOIN actloc ON actobj.id = actloc.actobj_id LEFT JOIN loc ON actloc.loc_id = loc.id", " LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ", obj_id, " AND actobj.series_id = ", series_id, sep = "")) odbcClose(db) Locs <- Locs[order(Locs[,1]),] rownames(Locs) <- 1:nrow(Locs) return(Locs) }
- Returns all indexes and locations and their ids in a table of format: ind, loc, loc_id.
Manipulating data
DataframeToArray <- function(dataframe, rescol = NULL) { ColNames <- colnames(dataframe[(colnames(dataframe) == "id") == FALSE]) if (length(ColNames[(ColNames == "obs")])>0) {if(length(levels(factor(dataframe[,"obs"]))) == 1) {ColNames <- ColNames[ (ColNames == "obs") == FALSE]} else {dataframe[,"obs"] <- factor(as.character(dataframe[,"obs"]))}} if (length(rescol)==0) { rescol <- colnames(dataframe) == "Freq" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Freq" else { rescol <- colnames(dataframe) == "Result" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Result" else { rescol <- colnames(dataframe) == "result" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "result" else stop("No result column found") } } } else {ColNames <- ColNames[(ColNames == rescol) == FALSE]} ColNames <- ColNames[(ColNames == "Result") == FALSE] ColNames <- ColNames[(ColNames == "result") == FALSE] ColNames <- ColNames[(ColNames == "Freq") == FALSE] nind <- length(ColNames) DimNames <- rep(vector("list", 1), nind) names(DimNames) <- ColNames indlengths <- 0 for (i in 1:nind) { DimNames[[i]] <- levels(factor(dataframe[,ColNames[i]])) indlengths[i] <- length(DimNames[[i]]) } array <- rep(NA, prod(indlengths)) dim(array) <- indlengths dimnames(array) <- DimNames array[as.matrix(dataframe[,ColNames])] <- dataframe[,rescol] return(array) }
Usage
variable2 <- DataframeToArray(variable1, rescol = NULL)
- variable1 must be in similar format as the result when downloading.
- Columns named "id" and various versions of "Result" are ignored for dimension creation.
- "obs" column will also be ignored if there's only one.
- The column containing the values may be defined in the parameters, otherwise it is assumed to be either "Freq", "Result" or "result" in that order.
Uploading data
op_baseWrite <- function(dsn, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL, acttype = NULL, rescol = NULL, n.obs.const = FALSE, maxrows = 50000, use.utf8 = TRUE) { # Coerce input into a data frame if it isn't one already; get rid of empty cells if (is.array(input)) dataframe <- as.data.frame(as.table(input)) else dataframe <- input if (is.null(rescol)) { rescol <- colnames(dataframe) == "Freq" if (sum(rescol) == 1) rescol <- "Freq" else { rescol <- colnames(dataframe) == "Result" if (sum(rescol) == 1) rescol <- "Result" else { rescol <- colnames(dataframe) == "result" if (sum(rescol) == 1) rescol <- "result" } }} dataframe <- dataframe[is.na(dataframe[,rescol]) == FALSE,] ColNames <- colnames(dataframe)[!(colnames(dataframe)%in%c(rescol, "id", "obs"))] for (i in ColNames) { dataframe[,i] <- factor(dataframe[,i]) levels(dataframe[,i]) <- gsub(" *$", "",gsub("^ *", "", levels(dataframe[,i]))) if(use.utf8) if(sum(Encoding(levels(dataframe[,i]))=="latin1")!=0) levels(dataframe[,i]) <- iconv(levels(dataframe[,i]), "latin1", "UTF-8") } #if(!is.numeric(dataframe[,rescol])) # Open database connection if(use.utf8) db <- odbcConnect(dsn, DBMSencoding = "UTF-8") else db <- odbcConnect(dsn) # Add page to database (if it doesn't already exist) if (is.null(ident)) if (interactive()) ident <- readline(paste("What is the identifier of this object?", "\n", sep = "")) else stop("indentifier of object no given") obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1] if (is.na(obj_id)) { # Wiki id if (substr(ident, 1,5)=="Op_en") {wiki_id <- 1; page <- substr(ident, 6, nchar(ident))} else { if (substr(ident, 1,5)=="Op_fi") {wiki_id <- 2; page <- substr(ident, 6, nchar(ident))} else { if (substr(ident, 1,6)=="Heande") {wiki_id <- 6; page <- substr(ident, 7, nchar(ident))} else { if (substr(ident, 1,4)=="Erac") {wiki_id <- 6; page <- substr(ident, 5, nchar(ident))} else { wiki_id <- 0; page <- 0; warning("No wiki id found in ident, writing zero.")}}}} page <- as.numeric(page) if (is.na(page)) stop("could not convert characters following the wiki ident into a page number") # Name etc. if (is.null(name)) if (interactive()) name <- readline(paste("What is the name of this object?", "\n", sep = "")) else stop("object name not given") if (is.null(objtype_id)) if (interactive()) objtype_id <- readline(paste("What type of object is", " this (id)?", paste(paste(sqlQuery(db, "SELECT id FROM objtype")[,1], sqlQuery(db, paste("SELECT objtype", " FROM objtype", sep = ""))[,1], sep = " - "), collapse = ", "), "\n", collapse = " ")) else { stop("object type not given")} sqlQuery(db, paste('INSERT INTO obj (ident, name, objtype_id, page, wiki_id) VALUES ("', paste(ident, name, sep = '","'), '",', paste(objtype_id, page, wiki_id, sep = ','), ')', sep = '')) obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1] } # Write act and actobj if (is.null(who)==TRUE) if (interactive()) {who <- readline(paste("What is the name of the uploader?", "\n", sep = "")) } else stop("uploader name not given") series_id <- sqlQuery(db, paste("SELECT series_id FROM actobj WHERE obj_id = ", obj_id, " ORDER BY series_id DESC LIMIT 1", sep = ""))[1,1] if (is.na(series_id)==FALSE) {if (is.null(acttype)==TRUE) {if (interactive()) {acttype <- readline(paste("What type of upload", " is this? 4 - new data to replace any existing, 5 - new data to be appended to existing data (must have the same", " indices).", "\n", sep = "")) } else acttype <- 4 }} else acttype <- 4 if (!(acttype%in%c(4,5))) stop ("proper acttype not given") sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (', acttype, ',"', who, '","R upload")', sep = '')) act_id <- sqlQuery(db, paste('SELECT id FROM act WHERE who = "', who,'" AND comments = "R upload" ORDER BY id DESC LIMIT 1', sep = ''))[1,1] if (acttype == 4) series_id <- act_id if (is.null(unit)) if (interactive()) unit <- readline(paste("What is the unit of this object?", "\n", sep = "")) else stop("unit not given") sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id, unit) VALUES (', paste(act_id, obj_id, series_id, sep = ','), ',"', unit, '")', sep = '')) actobj_id <- sqlQuery(db, paste('SELECT id FROM actobj WHERE act_id = ', act_id, sep = ''))[1,1] #Write indexes for (i in ColNames) { sqlQuery(db, paste('INSERT IGNORE INTO obj (ident, name, objtype_id) VALUES ("', gsub(' ', '_', i), '","', i, '", 6)', sep = '')) } IndIds <- sqlQuery(db, paste('SELECT id, ident FROM obj WHERE ident IN("', paste(gsub(" ", "_", ColNames), collapse = '","'), '")', sep = '')) IndIdMap <- IndIds$id names(IndIdMap) <- tolower(IndIds$ident) ColIds <- as.character(IndIdMap[tolower(gsub(" ", "_", ColNames))]) colnames(dataframe)[colnames(dataframe)%in%ColNames] <- ColIds #Write locations for (i in ColIds) { for (j in levels(dataframe[, i])) { sqlQuery(db, paste('INSERT IGNORE INTO loc (obj_id_i, location) VALUES (', i, ',"', j, '")', sep = '')) } } LocIds <- sqlQuery(db, paste('SELECT id, obj_id_i, location FROM loc WHERE obj_id_i IN("', paste(ColIds, collapse = '","'), '")', sep = '')) for (i in ColIds) { LocIdMap <- LocIds[LocIds$obj_id_i == i, 1] names(LocIdMap) <- gsub(" *$", "",gsub("^ *", "", tolower(LocIds[LocIds$obj_id_i == i, 3]))) levels(dataframe[, i]) <- LocIdMap[tolower(levels(dataframe[, i]))] if (sum(is.na(levels(dataframe[, i]))) != 0) stop("Faulty location matching. Usually caused by special characters.") #Writing actloc sqlQuery(db, paste("INSERT INTO actloc (actobj_id, loc_id) VALUES (", paste(actobj_id, levels(dataframe[, i]), sep = ",", collapse = "),("), ")", sep = "")) } #Writing cell n <- tapply(dataframe[,rescol], dataframe[,ColIds], length) ncell <- sum(n, na.rm = TRUE) if (is.numeric(dataframe[,rescol])) means <- tapply(dataframe[,rescol], dataframe[,ColIds], mean) else means <- rep(0, ncell) if (is.numeric(dataframe[,rescol])) { sds <- tapply(dataframe[,rescol], dataframe[,ColIds], sd); sds[] <- ifelse(n == 1, 0, sds)} else sds <- rep(0, ncell) cellQuery <- paste(actobj_id, means[!is.na(means)], sds[!is.na(sds)], n[!is.na(n)], sep = ",") i <- 1 while (length(cellQuery) >= (i + maxrows - 1)) { sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, sd, n) VALUES (', paste(cellQuery[i:(i + maxrows - 1)], collapse = '),('), ')', sep = '')) i <- i + maxrows } if (length(cellQuery) %% maxrows != 0) { sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, sd, n) VALUES (', paste(cellQuery[i:length(cellQuery)], collapse = '),('), ')', sep = '')) } #Writing res cell_id <- sqlQuery(db, paste("SELECT id FROM cell WHERE actobj_id = ", actobj_id, " ORDER BY ID", sep = ""))[,1] if (length(cell_id) != ncell) stop("number of written cells differs from given data") if (is.numeric(dataframe[,rescol])) ids <- means else ids <- n ids[!is.na(ids)] <- cell_id dataframe[, ncol(dataframe) + 1] <- ids[as.matrix(dataframe[,ColIds])] colnames(dataframe)[ncol(dataframe)] <- "cell_id" resQuery <- paste(dataframe[,"cell_id"], ',', if(sum(colnames(dataframe) == "obs") == 0) 1 else dataframe[,"obs"], ',', if (!is.numeric(dataframe[,rescol])) '"', dataframe[,rescol], if (!is.numeric(dataframe[,rescol])) '"', sep = "") i <- 1 while (length(resQuery) >= (i + maxrows - 1)) { sqlQuery(db, paste('INSERT INTO res (cell_id, obs, ', ifelse(is.numeric(dataframe[,rescol]), "result", "restext"), ') VALUES (', paste(resQuery[i:(i + maxrows - 1)], collapse = '),('), ')', sep = '')) i <- i + maxrows } if (length(resQuery) %% maxrows != 0) { sqlQuery(db, paste('INSERT INTO res (cell_id, obs, ', ifelse(is.numeric(dataframe[,rescol]), "result", "restext"), ') VALUES (', paste(resQuery[i:length(resQuery)], collapse = '),('), ')', sep = '')) } #Writing loccell ids <- as.data.frame(as.table(ids)) ids <- ids[!is.na(ids$Freq),] loccellQuery <- paste(ids$Freq, unlist(ids[,-ncol(ids)]), sep = ",") i <- 1 while (length(loccellQuery) >= (i + maxrows - 1)) { sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[i:(i + maxrows - 1)], collapse = '),('), ')', sep = '')) i <- i + maxrows } if (length(loccellQuery) %% maxrows != 0) { sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[i:length(loccellQuery)], collapse = '),('), ')', sep = '')) } #Close database connection odbcClose(db) cat("Successful\n") return(character()) }
Usage
op_baseWrite(dsn, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL, acttype = NULL, rescol = NULL)
- dsn and input must be defined, the rest of the object and act parameters if not defined are prompted for by the function as needed.
- For uploading the DSN defined must have writers permissions.
- rescol defines the column from which the values are chosen from, both numerical and textual data are allowed, if left undefined the function will check column matches for "Freq", "Result" and "result" in that order.
Restrictions
- Input may only be given in either array or data.frame form.
- Indexes used may not exceed the character limit of 20.
- Indexes should preferably match an earlier entry: Special:OpasnetBaseIndices.
- Indexes are treated as identifiers for indexes in the database, spaces in the indexes are converted to _. This ensures maximum compatibility and ease in operations in which data is downloaded and uploaded again. Names and more specific details can be edited into the indexes separately.
- Indexes used may not exceed the character limit of 20.