Opasnet Base Connection for R: Difference between revisions

From Opasnet
Jump to navigation Jump to search
mNo edit summary
Line 56: Line 56:
  variable <- op_baseGetData("page identifier")
  variable <- op_baseGetData("page identifier")


*Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, result.
*Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result.


==Manipulating data==
==Manipulating data==

Revision as of 11:48, 4 October 2010


Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code to R; this defines the functions, after which they can be called for in R.

Package dependencies

These packages are required for most of the code to work. Load these first (copy-pasteable).

library(utils)
library(RODBC)

Setup

Establishes the connection to the database. Copy paste first.

db <- odbcConnect("opasnet_base") 
  • Assuming "opasnet_base" is a correctly defined DSN.
    • Note that uploading requires writing permissions.

Downloading data

Functions

  • Now simplified, a lot faster. Copy paste second.
op_baseGetData <- function(ident) {
	obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=))[1,1]
	series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, 
		' ORDER BY series_id DESC LIMIT 1'))[1,1]
	Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.name AS ind, loc.location AS loc, res.result', 
		' FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', 
		' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id',
		' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, 
		' AND actobj.series_id = ', series_id, ' ORDER BY cell.id, res.obs, obj.name', sep = ))
	nind <- length(levels(Data[,3]))
	ncell <- nrow(Data)/nind
	dataframe <- data.frame(NA)
	dataframe <- Data[1:ncell*nind, c(1,2)]
	for (i in 1:nind) {
		dataframe[,2 + i] <- Data[1:ncell*nind - i + 1, 4]
		colnames(dataframe)[2 + i] <- as.character(Data[nind - i + 1, 3])
	}
	dataframe[,3 + nind] <- Data[1:ncell*nind, 5]
	colnames(dataframe)[3 + nind] <- "Result"
	rownames(dataframe) <- 1:ncell
	dataframe
}

Usage

variable <- op_baseGetData("page identifier")
  • Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result.

Manipulating data

Functions

DataframeToArray <- function(dataframe) {
	DimNames <- rep(vector("list", 1), ncol(dataframe) - 2)
	names(DimNames) <- c(colnames(dataframe)[2:(ncol(dataframe) - 1)])
	nind <- length(names(DimNames))
	indlengths <- 0
	for (i in 1:nind) {
		DimNames[[i]] <- levels(factor(dataframe[,1 + i]))
		indlengths[i] <- length(DimNames[[i]])
	}
	array <- rep(NA, prod(indlengths))
	dim(array) <- indlengths
	dimnames(array) <- DimNames
	array[as.matrix(dataframe[,c(1:nind + 1)])] <- dataframe[,ncol(dataframe)]
	array
}

Usage

variable2 <- DataframeToArray(variable1)
  • variable 1 must be in the same format as the result when downloading.
    • first column (id) is ignored
  • variable1 <- function(variable1) is allowed

Other useful stuff

as.data.frame(as.table(array))
dataframe[is.na(dataframe[,ncol(dataframe)])==FALSE,]
  • First line returns an array from a data frame.
  • Second line returns rows without empty values.
dataframe[grep("location", dataframe$index, ignore.case = TRUE),]
  • Returns all rows of a data frame where column "index" value is "location".

Uploading data

Tested and works. Cell writing is efficient with even with huge data, index writing requires more work. Probabilistic data not supported.

Low level

library(utils)
library(RODBC)

dbSimpleRead <- function(db, table, column, filtercol, filter) {
	sqlQuery(db, paste('SELECT ', column, ' FROM ', table, ' WHERE ', filtercol, ' = "', filter, '"', sep = ))[1,1]
}

dbLessSimpleRead <- function(db, table, column, filtercol1, filter1, filtercol2, filter2) {
	sqlQuery(db, paste('SELECT ', column, ' FROM ', table, ' WHERE ', filtercol1, ' = "', filter1, '" AND ', filtercol2, ' = "', filter2, '"', sep = ))[1,1]
}

SimplePrompt <- function(question) {
	readline(paste(question, "\n", sep = ""))
}

High level

op_baseWriteArray <- function(db, array, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL) {
	
	originaldataframe <- as.data.frame(as.table(array))
	
	values <- originaldataframe[,"Freq"]
	cond <- (is.na(values) == FALSE)
	
	dataframe <- originaldataframe[cond,]
	
	#Add page to database (if it doesn't already exist)
	
	if (is.null(ident)==TRUE) if (interactive()) ident <- SimplePrompt("What is the identifier of this object?")
	wiki_name <- substring(ident, 1, 5)
	if (wiki_name=="Op_en") {wiki_id <- 1; page <- substring(ident, 6, nchar(ident))} else if (wiki_name=="Op_fi") {wiki_id <- 2; page <- substring(ident, 6, nchar(ident))} else {wiki_id <- 0; page <- 0}
	obj_id <- dbSimpleRead(db, "obj", "id", "ident", ident)
	if (is.na(obj_id)==TRUE) {
		if (is.null(name)==TRUE) if (interactive()) name <- SimplePrompt("What is the name of this object?") else error
		if (is.null(unit)==TRUE) if (interactive()) unit <- SimplePrompt("What is the unit of this object?") else error
		if (is.null(objtype_id)==TRUE) if (interactive()) objtype_id <- SimplePrompt(paste("What type of object is this (id)?", paste(paste(sqlQuery(db, "SELECT id FROM objtype")[,1], sqlQuery(db, "SELECT objtype FROM objtype")[,1], sep = " - "), collapse = ", "), collapse = " ")) else error
		sqlQuery(db, paste('INSERT INTO obj (ident, name, unit, objtype_id, page, wiki_id) VALUES ("', paste(ident, name, unit, sep = '","'), '",', paste(objtype_id, page, wiki_id, sep = ','), ')', sep = ))
		obj_id <- dbSimpleRead(db, "obj", "id", "ident", ident)
	}
	
	#Write act and actobj
	
	if (is.null(who)==TRUE) if (interactive()) who <- SimplePrompt("What is the name of the uploader?") else error
	sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (4, "', who, '","R upload")', sep = ))
	act_id <- sqlQuery(db, "SELECT id FROM act ORDER BY id DESC LIMIT 1")[1,1]
	sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id) VALUES (', paste(act_id, obj_id, act_id, sep = ','), ')', sep = ))
	actobj_id <- dbSimpleRead(db, "actobj", "id", "act_id", act_id)

	#Write indexes and locations
	x <- 1
	DimIds <- list()
	IndId <- 0
	Ind <- 0
	loc <- 0
	LocId <- 0
	for (i in 1:(ncol(dataframe) - x)) {
		DimIds[[i]] <- rep(NA, length(levels(factor(dataframe[,i]))))
		IndId <- dbSimpleRead(db, "obj", "id", "ident", dimnames(dataframe)[[2]][i])
		if (is.na(IndId)==TRUE) {
			sqlQuery(db, paste('INSERT INTO obj (ident, name, objtype_id) VALUES ("', paste(dimnames(dataframe)[[2]][i], dimnames(dataframe)[[2]][i], sep = '","'), '",', 6, ')', sep = ''))
			IndId <- dbSimpleRead(db, "obj", "id", "ident", dimnames(dataframe)[[2]][i])
		}
		for (j in 1:length(levels(factor(dataframe[,i])))) { #1:length(dimnames(array)[[i]])) { 
			LocId <- dbLessSimpleRead(db, "loc", "id", "location", levels(factor(dataframe[,i]))[j], "obj_id_i", IndId)
			if (is.na(LocId)==TRUE) {
				sqlQuery(db, paste('INSERT INTO loc (obj_id_i, location) VALUES (', IndId, ',"', levels(factor(dataframe[,i]))[j], '")', sep = ''))
				LocId <- dbLessSimpleRead(db, "loc", "id", "location", levels(factor(dataframe[,i]))[j], "obj_id_i", IndId)
			}
			DimIds[[i]][j] <- LocId
		}
		names(DimIds)[i] <- IndId
	}
	
	# Changing location names in table into ids
	
	for (i in 1:(ncol(dataframe)-1)) {
		dataframe[,i] <- factor(dataframe[,i])
		levels(dataframe[,i]) <- DimIds[[i]]
	}
	
	maxrows <- 5000
	
	#Writing cell
	
	n <- 1
	cellQuery <- NA
	cellQuery[1:nrow(dataframe)*6-5] <- actobj_id
	cellQuery[1:nrow(dataframe)*6-4] <- ","
	cellQuery[1:nrow(dataframe)*6-3] <- dataframe[,"Freq"]
	cellQuery[1:nrow(dataframe)*6-2] <- ","
	cellQuery[1:nrow(dataframe)*6-1] <- n
	cellQuery[1:(nrow(dataframe)-1)*6] <- "),("
	if (nrow(dataframe)>=maxrows) {
		for (i in 1:(nrow(dataframe)%/%maxrows)) {
			sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[((i-1)*6*maxrows+1):(i*6*maxrows-1)], collapse = ), ')', sep = ))
		}
	}
	sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[(nrow(dataframe)%/%maxrows*6*maxrows+1):length(cellQuery)], collapse = ), ')', sep = ))
	
	#Writing res
	
	cell_id <- rev(sqlQuery(db, paste("SELECT id FROM cell WHERE actobj_id = ", actobj_id, " ORDER BY ID DESC LIMIT ", nrow(dataframe), sep = ""))[,1])
	resQuery <- NA
	resQuery[1:length(cell_id)*4-3] <- cell_id
	resQuery[1:length(cell_id)*4-2] <- ","
	resQuery[1:length(cell_id)*4-1] <- dataframe[,"Freq"]
	resQuery[1:(length(cell_id)-1)*4] <- "),("
	if (length(cell_id)>=maxrows) {
		for (i in 1:(length(cell_id)%/%maxrows)) {
			sqlQuery(db, paste('INSERT INTO res (cell_id, result) VALUES (', paste(resQuery[((i-1)*4*maxrows+1):(i*4*maxrows-1)], collapse = ), ')', sep = ))
		}
	}
	sqlQuery(db, paste('INSERT INTO res (cell_id, result) VALUES (', paste(resQuery[(length(cell_id)%/%maxrows*4*maxrows+1):length(resQuery)], collapse = ), ')', sep = ))
	
	#Writing loccell
	
	locidmatrix <- rep(NA, (ncol(dataframe)-1)*length(cell_id))
	dim(locidmatrix) <- c(ncol(dataframe)-1,length(cell_id))
	for (i in 1:(ncol(dataframe)-1)) {
		locidmatrix[i,] <- as.character(dataframe[,i])
	}
	loccellQuery <- rep(NA, 4*(ncol(dataframe)-1)*length(cell_id))
	dim(loccellQuery) <- c(4*(ncol(dataframe)-1), length(cell_id))
	loccellQuery[1:(ncol(dataframe)-1)*4-3, 1:length(cell_id)] <- rep(cell_id, each = ncol(dataframe) - 1)
	loccellQuery[1:(ncol(dataframe)-1)*4-2, 1:length(cell_id)] <- ","
	loccellQuery[1:(ncol(dataframe)-1)*4-1, 1:length(cell_id)] <- locidmatrix
	loccellQuery[1:(ncol(dataframe)-1)*4, 1:length(cell_id)] <- "),("
	loccellQuery[(ncol(dataframe)-1)*4, length(cell_id)] <- ""
	if (length(cell_id)>=maxrows) {
		for (i in 1:(length(cell_id)%/%maxrows)) {
			loccellQuery[(ncol(dataframe)-1)*4, i*maxrows] <- ""
			sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,((i-1)*maxrows+1):(i*maxrows)], collapse = ), ')', sep = ))
		}
	}
	sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[,(length(cell_id)%/%maxrows*maxrows+1):length(cell_id)], collapse = ), ')', sep = ))
}

Usage

op_baseWriteArray(db, array, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL)

Db and array must be defined, the rest if not defined is prompted for by the function.