Opasnet Base Connection for R

From Opasnet
Revision as of 08:15, 1 October 2010 by Teemu R (talk | contribs)
Jump to navigation Jump to search


Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code to R; this defines the functions, after which they can be called in the R instance.

Package dependencies

These packages are required for most of the code to work. Load these first (copy-pasteable).

library(utils)
library(RODBC)

Setup

Establishes the connection to the database. Copy paste first.

db <- odbcConnect("opasnet_base") 
  • Assuming "opasnet_base" is a correctly defined DSN.
    • Note that uploading requires writing permissions.

Downloading data

  • Now simplified, a lot faster. Copy paste second.
op_baseGetData <- function(ident) {
	obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=))[1,1]
	series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, 
		' ORDER BY series_id DESC LIMIT 1'))[1,1]
	Indices <- sqlQuery(db, paste('SELECT DISTINCT obj.name AS ind, loc.location AS loc FROM', 
		' actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN loccell ON cell.id', 
		' = loccell.cell_id LEFT JOIN loc ON loccell.loc_id = loc.id LEFT JOIN obj ON loc.obj_id_i', 
		' = obj.id WHERE actobj.obj_id = ', obj_id, ' AND actobj.series_id = ', series_id, 
		' ORDER BY cell.id, obj.id, loc.id', sep = ))
	DimNames <- rep(vector("list", 1), length(levels(Indices[,1])) + 1)
	names(DimNames) <- c(levels(Indices[,1]), "Obs")
	nind <- length(names(DimNames)) - 1
	indlengths <- 0
	for (i in 1:nind) {
		DimNames[[i]] <- Indices[grep(names(DimNames)[i], Indices$ind),2]
		indlengths[i] <- length(DimNames[[i]])
	}
	Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.name AS ind, loc.location AS loc, res.result', 
		' FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', 
		' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id',
		' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, 
		' AND actobj.series_id = ', series_id, ' ORDER BY cell.id, res.obs, obj.name', sep = ))
	DimNames[[nind + 1]] <- as.numeric(levels(factor(Data[,2])))
	indlengths[nind + 1] <- length(DimNames[[nind + 1]])
	array <- rep(NA, prod(indlengths))
	dim(array) <- indlengths
	dimnames(array) <- DimNames
	ncell <- nrow(Data)/nind
	array[matrix(c(matrix(Data[,4], ncell, nind, byrow = TRUE), Data[1:ncell*nind, 2]), ncell, nind + 1)] <- 
	Data[1:ncell*nind,5]
	array
}

Usage

variable <- op_baseGetData("page identifier")

Uploading data

Tested and works. Cell writing is efficient with even with huge data, index writing requires more work. Probabilistic data not supported.

Low level

library(utils)
library(RODBC)

dbSimpleRead <- function(db, table, column, filtercol, filter) {
	sqlQuery(db, paste('SELECT ', column, ' FROM ', table, ' WHERE ', filtercol, ' = "', filter, '"', sep = ))[1,1]
}

dbLessSimpleRead <- function(db, table, column, filtercol1, filter1, filtercol2, filter2) {
	sqlQuery(db, paste('SELECT ', column, ' FROM ', table, ' WHERE ', filtercol1, ' = "', filter1, '" AND ', filtercol2, ' = "', filter2, '"', sep = ))[1,1]
}

SimplePrompt <- function(question) {
	readline(paste(question, "\n", sep = ""))
}

High level

op_baseWriteArray <- function(db, array, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL) {
	
	originaldataframe <- as.data.frame(as.table(array))
	
	values <- originaldataframe[,"Freq"]
	cond <- (is.na(values) == FALSE)
	
	dataframe <- originaldataframe[cond,]
	
	#Add page to database (if it doesn't already exist)
	
	if (is.null(ident)==TRUE) if (interactive()) ident <- SimplePrompt("What is the identifier of this object?")
	wiki_name <- substring(ident, 1, 5)
	if (wiki_name=="Op_en") {wiki_id <- 1; page <- substring(ident, 6, nchar(ident))} else if (wiki_name=="Op_fi") {wiki_id <- 2; page <- substring(ident, 6, nchar(ident))} else {wiki_id <- 0; page <- 0}
	obj_id <- dbSimpleRead(db, "obj", "id", "ident", ident)
	if (is.na(obj_id)==TRUE) {
		if (is.null(name)==TRUE) if (interactive()) name <- SimplePrompt("What is the name of this object?") else error
		if (is.null(unit)==TRUE) if (interactive()) unit <- SimplePrompt("What is the unit of this object?") else error
		if (is.null(objtype_id)==TRUE) if (interactive()) objtype_id <- SimplePrompt(paste("What type of object is this (id)?", paste(paste(sqlQuery(db, "SELECT id FROM objtype")[,1], sqlQuery(db, "SELECT objtype FROM objtype")[,1], sep = " - "), collapse = ", "), collapse = " ")) else error
		sqlQuery(db, paste('INSERT INTO obj (ident, name, unit, objtype_id, page, wiki_id) VALUES ("', paste(ident, name, unit, sep = '","'), '",', paste(objtype_id, page, wiki_id, sep = ','), ')', sep = ))
		obj_id <- dbSimpleRead(db, "obj", "id", "ident", ident)
	}
	
	#Write act and actobj
	
	if (is.null(who)==TRUE) if (interactive()) who <- SimplePrompt("What is the name of the uploader?") else error
	sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (4, "', who, '","R upload")', sep = ))
	act_id <- sqlQuery(db, "SELECT id FROM act ORDER BY id DESC LIMIT 1")[1,1]
	sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id) VALUES (', paste(act_id, obj_id, act_id, sep = ','), ')', sep = ))
	actobj_id <- dbSimpleRead(db, "actobj", "id", "act_id", act_id)

	#Write indexes and locations
	x <- 1
	DimIds <- list()
	IndId <- 0
	Ind <- 0
	loc <- 0
	LocId <- 0
	for (i in 1:(ncol(dataframe) - x)) {
		DimIds[[i]] <- rep(NA, length(levels(factor(dataframe[,i]))))
		IndId <- dbSimpleRead(db, "obj", "id", "ident", dimnames(dataframe)[[2]][i])
		if (is.na(IndId)==TRUE) {
			sqlQuery(db, paste('INSERT INTO obj (ident, name, objtype_id) VALUES ("', paste(dimnames(dataframe)[[2]][i], dimnames(dataframe)[[2]][i], sep = '","'), '",', 6, ')', sep = ''))
			IndId <- dbSimpleRead(db, "obj", "id", "ident", dimnames(dataframe)[[2]][i])
		}
		for (j in 1:length(levels(factor(dataframe[,i])))) { #1:length(dimnames(array)[[i]])) { 
			LocId <- dbLessSimpleRead(db, "loc", "id", "location", levels(factor(dataframe[,i]))[j], "obj_id_i", IndId)
			if (is.na(LocId)==TRUE) {
				sqlQuery(db, paste('INSERT INTO loc (obj_id_i, location) VALUES (', IndId, ',"', levels(factor(dataframe[,i]))[j], '")', sep = ''))
				LocId <- dbLessSimpleRead(db, "loc", "id", "location", levels(factor(dataframe[,i]))[j], "obj_id_i", IndId)
			}
			DimIds[[i]][j] <- LocId
		}
		names(DimIds)[i] <- IndId
	}
	
	# Changing location names in table into ids
	
	for (i in 1:(ncol(dataframe)-1)) {
		dataframe[,i] <- factor(dataframe[,i])
		levels(dataframe[,i]) <- DimIds[[i]]
	}
	
	maxrows <- 5000
	
	#Writing cell
	
	n <- 1
	cellQuery <- NA
	cellQuery[1:nrow(dataframe)*6-5] <- actobj_id
	cellQuery[1:nrow(dataframe)*6-4] <- ","
	cellQuery[1:nrow(dataframe)*6-3] <- dataframe[,"Freq"]
	cellQuery[1:nrow(dataframe)*6-2] <- ","
	cellQuery[1:nrow(dataframe)*6-1] <- n
	cellQuery[1:(nrow(dataframe)-1)*6] <- "),("
	if (nrow(dataframe)>=maxrows) {
		for (i in 1:(nrow(dataframe)%/%maxrows)) {
			sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[((i-1)*6*maxrows+1):(i*6*maxrows-1)], collapse = ), ')', sep = ))
		}
	}
	sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[(nrow(dataframe)%/%maxrows*6*maxrows+1):length(cellQuery)], collapse = ), ')', sep = ))
	
	#Writing res
	
	cell_id <- rev(sqlQuery(db, paste("SELECT id FROM cell WHERE actobj_id = ", actobj_id, " ORDER BY ID DESC LIMIT ", nrow(dataframe), sep = ""))[,1])
	resQuery <- NA
	resQuery[1:length(cell_id)*4-3] <- cell_id
	resQuery[1:length(cell_id)*4-2] <- ","
	resQuery[1:length(cell_id)*4-1] <- dataframe[,"Freq"]
	resQuery[1:(length(cell_id)-1)*4] <- "),("
	if (length(cell_id)>=maxrows) {
		for (i in 1:(length(cell_id)%/%maxrows)) {
			sqlQuery(db, paste('INSERT INTO res (cell_id, result) VALUES (', paste(resQuery[((i-1)*4*maxrows+1):(i*4*maxrows-1)], collapse = ), ')', sep = ))
		}
	}
	sqlQuery(db, paste('INSERT INTO res (cell_id, result) VALUES (', paste(resQuery[(length(cell_id)%/%maxrows*4*maxrows+1):length(resQuery)], collapse = ), ')', sep = ))
	
	#Writing loccell
	
	locidmatrix <- rep(NA, (ncol(dataframe)-1)*length(cell_id))
	dim(locidmatrix) <- c(ncol(dataframe)-1,length(cell_id))
	for (i in 1:(ncol(dataframe)-1)) {
		locidmatrix[i,] <- as.character(dataframe[,i])
	}
	loccellQuery <- rep(NA, 4*(ncol(dataframe)-1)*length(cell_id))
	dim(loccellQuery) <- c(4*(ncol(dataframe)-1), length(cell_id))
	loccellQuery[1:(ncol(dataframe)-1)*4-3, 1:length(cell_id)] <- rep(cell_id, each = ncol(dataframe) - 1)
	loccellQuery[1:(ncol(dataframe)-1)*4-2, 1:length(cell_id)] <- ","
	loccellQuery[1:(ncol(dataframe)-1)*4-1, 1:length(cell_id)] <- locidmatrix
	loccellQuery[1:(ncol(dataframe)-1)*4, 1:length(cell_id)] <- "),("
	loccellQuery[(ncol(dataframe)-1)*4, length(cell_id)] <- ""
	if (length(cell_id)>=maxrows) {
		for (i in 1:(length(cell_id)%/%maxrows)) {
			loccellQuery[(ncol(dataframe)-1)*4, i*maxrows] <- ""
			sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,((i-1)*maxrows+1):(i*maxrows)], collapse = ), ')', sep = ))
		}
	}
	sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,(length(cell_id)%/%maxrows*maxrows+1):length(cell_id)], collapse = ), ')', sep = ))
}

Usage

op_baseWriteArray(db, array, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL)

Db and array must be defined, the rest if not defined is prompted for by the function.