Opasnet Base Connection for R

From Opasnet
Revision as of 11:12, 6 October 2010 by Teemu R (talk | contribs)
Jump to navigation Jump to search


Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code to R; this defines the functions, after which they can be called for in R.

Package dependencies

These packages are required for most of the code to work. Load these first (copy-pasteable).

library(utils)
library(RODBC)

Setup

Establishes the connection to the database. Copy paste first.

db <- odbcConnect("opasnet_base") 
  • Assuming "opasnet_base" is a correctly defined DSN.
    • Note that uploading requires writing permissions.

Downloading data

Functions

  • Now simplified, a lot faster. Copy paste second.
op_baseGetData <- function(ident) {
	obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1]
	series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, 
		' ORDER BY series_id DESC LIMIT 1'))[1,1]
	Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.name AS ind, loc.location AS loc, res.result', 
		' FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', 
		' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id',
		' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, 
		' AND actobj.series_id = ', series_id, ' ORDER BY cell.id, res.obs, obj.name', sep = ''))
	nind <- length(levels(Data[,3]))
	ncell <- nrow(Data)/nind
	dataframe <- data.frame(NA)
	dataframe <- Data[1:ncell*nind, c(1,2)]
	for (i in 1:nind) {
		dataframe[,2 + i] <- Data[1:ncell*nind - i + 1, 4]
		colnames(dataframe)[2 + i] <- as.character(Data[nind - i + 1, 3])
	}
	dataframe[,3 + nind] <- Data[1:ncell*nind, 5]
	colnames(dataframe)[3 + nind] <- "Result"
	rownames(dataframe) <- 1:ncell
	dataframe
}

Usage

variable <- op_baseGetData("page identifier")
  • Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result.

Manipulating data

Functions

DataframeToArray <- function(dataframe) {
	DimNames <- rep(vector("list", 1), ncol(dataframe) - 2)
	names(DimNames) <- c(colnames(dataframe)[2:(ncol(dataframe) - 1)])
	nind <- length(names(DimNames))
	indlengths <- 0
	for (i in 1:nind) {
		DimNames[[i]] <- levels(factor(dataframe[,1 + i]))
		indlengths[i] <- length(DimNames[[i]])
	}
	array <- rep(NA, prod(indlengths))
	dim(array) <- indlengths
	dimnames(array) <- DimNames
	array[as.matrix(dataframe[,c(1:nind + 1)])] <- dataframe[,ncol(dataframe)]
	array
}

Usage

variable2 <- DataframeToArray(variable1)
  • variable 1 must be in the same format as the result when downloading.
    • first column (id) is ignored
  • variable1 <- function(variable1) is allowed

Other useful stuff

as.data.frame(as.table(array))
dataframe[is.na(dataframe[,ncol(dataframe)])==FALSE,]
  • First line returns an array from a data frame.
  • Second line returns rows without empty values.
dataframe[grep("location", dataframe$index, ignore.case = TRUE),]
  • Returns all rows of a data frame where column "index" value is "location".

Uploading data

Slight problems with probabilistic data, sorting orders do not match between read and write. Very efficient. Probabilistic data supported. Appending to existing data not supported, a whole new set will always be produced. Varying numbers of iterations between cells not supported.

Functions

op_baseWrite <- function(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL) {
	
	#Coerce input into a data frame if it isn't one already; get rid of empty cells
	
	if (is.array(input)) dataframe <- as.data.frame(as.table(input)) else dataframe <- input
	rescol <- colnames(dataframe) == "Freq"
	if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Freq" else {
		rescol <- colnames(dataframe) == "Result"
		if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Result" else {
			rescol <- colnames(dataframe) == "result"
			if (length(rescol[rescol==TRUE]) >= 1) rescol <- "result"
		}
	}
	values <- dataframe[,rescol]
	cond <- (is.na(values) == FALSE)
	dataframe <- dataframe[cond,]
	
	#Add page to database (if it doesn't already exist)
	
	if (is.null(ident)==TRUE) if (interactive()) ident <- readline(paste("What is the identifier of this object?", 
		"\n", sep = "")) else stop("indentifier of object no given")
	wiki_name <- substring(ident, 1, 5)
	if (wiki_name=="Op_en") {wiki_id <- 1; page <- substring(ident, 6, nchar(ident))} else if (wiki_name=="Op_fi") {
		wiki_id <- 2; page <- substring(ident, 6, nchar(ident))} else {wiki_id <- 0; page <- 0}
	obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1]
	if (is.na(obj_id)==TRUE) {
		if (is.null(name)==TRUE) if (interactive()) name <- readline(paste("What is the name of this object?", 
			"\n", sep = "")) else stop("object name not given")
		if (is.null(unit)==TRUE) if (interactive()) unit <- readline(paste("What is the unit of this object?", 
			"\n", sep = "")) else stop("unit not given")
		if (is.null(objtype_id)==TRUE) if (interactive()) objtype_id <- readline(paste("What type of object is", 
			" this (id)?", paste(paste(sqlQuery(db, "SELECT id FROM objtype")[,1], sqlQuery(db, "SELECT objtype",
			" FROM objtype")[,1], sep = " - "), collapse = ", "), "\n", collapse = " ")) else {
			stop("object type not given")}
		sqlQuery(db, paste('INSERT INTO obj (ident, name, unit, objtype_id, page, wiki_id) VALUES ("', paste(ident, 
			name, unit, sep = '","'), '",', paste(objtype_id, page, wiki_id, sep = ','), ')', sep = ''))
		obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1]
	}
	
	#Write act and actobj
	
	if (is.null(who)==TRUE) if (interactive()) {who <- readline(paste("What is the name of the uploader?", "\n", sep = "")) 
		} else stop("uploader name not given")
	sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (4, "', who, '","R upload")', sep = ''))
	act_id <- sqlQuery(db, "SELECT id FROM act ORDER BY id DESC LIMIT 1")[1,1]
	sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id) VALUES (', paste(act_id, obj_id, act_id, sep = ','), 
		')', sep = ''))
	actobj_id <- sqlQuery(db, paste('SELECT id FROM actobj WHERE act_id = ', act_id, sep = ''))[1,1]
	
	#Write indexes
	
	ColNames <- colnames(dataframe[(colnames(dataframe) == "id") == FALSE])
	ColNames <- ColNames[(ColNames == "obs") == FALSE]
	ColNames <- ColNames[(ColNames == "Result") == FALSE]
	ColNames <- ColNames[(ColNames == "result") == FALSE]
	ColNames <- ColNames[(ColNames == "Freq") == FALSE]
	nind <- length(ColNames)
	DimNames <- rep(vector("list", 1), nind)
	names(DimNames) <- ColNames
	indlengths <- 0
	for (i in 1:nind) {
		DimNames[[i]] <- levels(factor(dataframe[, ColNames[i]]))
		indlengths[i] <- length(DimNames[[i]])
	}
	
	indQuery <- NA
	indQuery[1:nind*5 - 4] <- gsub(" ", "_", ColNames)
	indQuery[1:nind*5 - 3] <- '","'
	indQuery[1:nind*5 - 2] <- ColNames
	indQuery[1:nind*5 - 1] <- '",6'
	indQuery[1:(nind - 1)*5] <- '),("'
	sqlQuery(db, paste('INSERT IGNORE INTO obj (ident, name, objtype_id) VALUES ("', paste(indQuery, collapse = ), ')', 
		sep = ''))
	
	IndIds <- sqlQuery(db, paste('SELECT id, ident FROM obj WHERE ident IN("', paste(gsub(" ", "_", ColNames), 
		collapse = '","'), '")', sep = ''))
	DimIds <- DimNames
	DimN <- 1:nind
	names(DimN) <- gsub(" ", "_", ColNames)
	for (i in 1:nrow(IndIds)) {
		names(DimIds)[DimN[IndIds[i,2]]] <- IndIds[grep(gsub(" ", "_", ColNames[i]), IndIds$ident), 1]
	}
	
	#Write locations
	
	y <- 1
	LocNames <- as.data.frame(matrix(rep(NA, 2*sum(indlengths)), sum(indlengths), 2))
	for (i in 1:nind) {
		LocNames[y:(y + indlengths[i] - 1),1:2] <- matrix(c(rep(ColNames[i], indlengths[i]), 
			levels(factor(dataframe[,ColNames[i]]))), indlengths[i], 2)
		y <- y + indlengths[i]
	}
	LocNames[,1] <- names(DimIds)[DimN[LocNames[,1]]]
	
	nloc <- nrow(LocNames)
	locQuery <- NA
	locQuery[1:nloc*4 - 3] <- LocNames[,1]
	locQuery[1:nloc*4 - 2] <- ',"'
	locQuery[1:nloc*4 - 1] <- LocNames[,2]
	locQuery[1:(nloc - 1)*4] <- '"),('
	sqlQuery(db, paste('INSERT IGNORE INTO loc (obj_id_i, location) VALUES (', paste(locQuery, collapse = ), '")', 
		sep = ''))
	
	LocIds <- sqlQuery(db, paste('SELECT id, obj_id_i, location FROM loc WHERE location IN("', paste(LocNames[,2], 
		collapse = '","'), '") AND obj_id_i IN(', paste(names(DimIds), collapse = ','), ')', sep = ''))
	LocMap <- NA
	y <- 1
	for (i in 1:nind) {
		LocMap <- LocIds[grep(names(DimIds)[i], LocIds$obj_id_i), 1]
		names(LocMap) <- LocIds[grep(names(DimIds)[i], LocIds$obj_id_i), 3]
		DimIds[[i]] <- LocMap[DimNames[[i]]]
	}
	
	#Changing location names in table into ids
	
	for (i in 1:nind) {
		dataframe[,ColNames[i]] <- factor(dataframe[,ColNames[i]])
		levels(dataframe[,ColNames[i]]) <- DimIds[[i]]
	}
	
	#A hidden parameter for adjusting query packet sizes, the higher the faster, though crash becomes likelier
	
	maxrows <- 50000
	
	#Writing cell
	
	obscol <- colnames(dataframe) == "obs"
	if (length(obscol[obscol==TRUE]) >= 1) {
		obscol <- TRUE
		n <- length(levels(factor(dataframe[,"obs"])))} else {
		obscol <- FALSE
		n <- 1
	}
	ncell <- nrow(dataframe)/n
	cellQuery <- NA
	cellQuery[1:ncell*6-5] <- actobj_id
	cellQuery[1:ncell*6-4] <- ","
	if (n == 1) {
		cellQuery[1:ncell*6-3] <- dataframe[,rescol]} else {
		cellQuery[1:ncell*6-3] <- apply(matrix(dataframe[,rescol], n, ncell), 2, mean)
	}
	cellQuery[1:ncell*6-2] <- ","
	cellQuery[1:ncell*6-1] <- n
	cellQuery[1:(ncell-1)*6] <- "),("
	if (ncell>=maxrows) {
		for (i in 1:(ncell%/%maxrows)) {
			sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[
				((i-1)*6*maxrows+1):(i*6*maxrows-1)], collapse = ''), ')', sep = ''))
		}
	}
	sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[(ncell%/%
		maxrows*6*maxrows+1):length(cellQuery)], collapse = ''), ')', sep = ''))
	
	#Writing res
	
	cell_id <- sqlQuery(db, paste("SELECT id FROM cell WHERE actobj_id = ", actobj_id, " ORDER BY ID", sep = ""))[,1]
	if (ncell != length(cell_id)) stop("Number of cells written does not match with expected value")
	resQuery <- NA
	resQuery[1:nrow(dataframe)*6-5] <- rep(cell_id, n)
	resQuery[1:nrow(dataframe)*6-4] <- ","
	if (n==1) resQuery[1:nrow(dataframe)*6-3] <- "0" else resQuery[1:nrow(dataframe)*6-3] <- dataframe[,"obs"]
	resQuery[1:nrow(dataframe)*6-2] <- ","
	resQuery[1:nrow(dataframe)*6-1] <- dataframe[,rescol]
	resQuery[1:(nrow(dataframe)-1)*6] <- "),("
	if (nrow(dataframe)>=maxrows) {
		for (i in 1:(nrow(dataframe)%/%maxrows)) {
			sqlQuery(db, paste('INSERT INTO res (cell_id, obs, result) VALUES (', paste(resQuery[((i-1)
				*4*maxrows+1):(i*4*maxrows-1)], collapse = ''), ')', sep = ''))
		}
	}
	sqlQuery(db, paste('INSERT INTO res (cell_id, obs, result) VALUES (', paste(resQuery[(nrow(dataframe)%/%maxrows
		*4*maxrows+1):length(resQuery)], collapse = ''), ')', sep = ''))
	
	#Writing loccell
	
	locidmatrix <- rep(NA, nind*ncell)
	dim(locidmatrix) <- c(nind,ncell)
	for (i in 1:nind) {
		locidmatrix[i,] <- as.character(dataframe[1:ncell*n - n + 1,ColNames[i]])
	}
	loccellQuery <- rep(NA, 4*nind*ncell)
	dim(loccellQuery) <- c(4*nind, ncell)
	loccellQuery[1:nind*4-3, 1:ncell] <- rep(cell_id, each = nind)
	loccellQuery[1:nind*4-2, 1:ncell] <- ","
	loccellQuery[1:nind*4-1, 1:ncell] <- locidmatrix
	loccellQuery[1:nind*4, 1:ncell] <- "),("
	loccellQuery[nind*4, ncell] <- ""
	if (ncell>=maxrows) {
		for (i in 1:(ncell%/%maxrows)) {
			loccellQuery[nind*4, i*maxrows] <- ""
			sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,
				((i-1)*maxrows+1):(i*maxrows)], collapse = ''), ')', sep = ''))
		}
	}
	sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,(ncell%/%
		maxrows*maxrows+1):ncell], collapse = ''), ')', sep = ''))
}

Usage

op_baseWrite(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL)
  • Db and input must be defined, the rest if not defined is prompted for by the function.
  • Input may be given in either array or data.frame form.
    • Input in data.frame form must contain a "Freq", "Result" or "result" column, this is where the numerical value is read.
    • Probabilistic input must contain a dimension (in case of array) or a column (in case of data.frame) named "obs". Also: all cells must have values for each "obs" value.