Opasnet Base Connection for R: Difference between revisions
Jump to navigation
Jump to search
mNo edit summary |
mNo edit summary |
||
Line 40: | Line 40: | ||
LEFT JOIN actobj | LEFT JOIN actobj | ||
ON obj.id = actobj.obj_id | ON obj.id = actobj.obj_id | ||
WHERE obj.ident = "', ident, '"', sep = '')) | <nowiki>WHERE obj.ident = "', ident, '"', sep = ''))</nowiki> | ||
series_list[nrow(series_list),1] | series_list[nrow(series_list),1] | ||
} | } | ||
Line 53: | Line 53: | ||
LEFT JOIN res | LEFT JOIN res | ||
ON cell.id = res.cell_id | ON cell.id = res.cell_id | ||
WHERE actobj.series_id = ', series_id, sep = '')) | <nowiki>WHERE actobj.series_id = ', series_id, sep = ''))</nowiki> | ||
} | } | ||
Line 65: | Line 65: | ||
LEFT JOIN obj | LEFT JOIN obj | ||
ON loc.obj_id_i = obj.id | ON loc.obj_id_i = obj.id | ||
WHERE loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = '')) | <nowiki>WHERE loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = ''))</nowiki> | ||
} | } | ||
Line 84: | Line 84: | ||
ON loccell.loc_id = loc.id | ON loccell.loc_id = loc.id | ||
WHERE loccell.cell_id | WHERE loccell.cell_id | ||
BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = '')) | <nowiki>BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = ''))</nowiki> | ||
as.numeric(indexes[,1]) | as.numeric(indexes[,1]) | ||
} | } | ||
Line 95: | Line 95: | ||
LEFT JOIN loc | LEFT JOIN loc | ||
ON loccell.loc_id = loc.id | ON loccell.loc_id = loc.id | ||
WHERE loc.obj_id_i = ', index, ' AND loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = '')) | <nowiki>WHERE loc.obj_id_i = ', index, ' AND loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = ''))</nowiki> | ||
as.numeric(locations[,1]) | as.numeric(locations[,1]) | ||
} | } | ||
Line 106: | Line 106: | ||
LEFT JOIN loc | LEFT JOIN loc | ||
ON loccell.loc_id = loc.id | ON loccell.loc_id = loc.id | ||
WHERE loc.obj_id_i = ', index, ' AND loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = '')) | <nowiki>WHERE loc.obj_id_i = ', index, ' AND loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = ''))</nowiki> | ||
as.numeric(locations[,1]) | as.numeric(locations[,1]) | ||
} | } | ||
Line 115: | Line 115: | ||
SELECT location | SELECT location | ||
FROM loc | FROM loc | ||
WHERE id = ', loc_id, sep = '')) | <nowiki>WHERE id = ', loc_id, sep = ''))</nowiki> | ||
} | } | ||
Line 123: | Line 123: | ||
SELECT name | SELECT name | ||
FROM obj | FROM obj | ||
WHERE id = ', ind_id, sep = '')) } | <nowiki>WHERE id = ', ind_id, sep = '')) }</nowiki> | ||
#Get locations for a specific cell | #Get locations for a specific cell | ||
Line 132: | Line 132: | ||
LEFT JOIN loc | LEFT JOIN loc | ||
ON loccell.loc_id = loc.id | ON loccell.loc_id = loc.id | ||
WHERE loccell.cell_id = ', cell_id, sep = '')) | <nowiki>WHERE loccell.cell_id = ', cell_id, sep = ''))</nowiki> | ||
} | } | ||
Revision as of 06:51, 23 August 2010
Moderator:Teemu R (see all) |
|
Upload data
|
Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code to R; this defines the functions, after which they can be called in R. I've tried to explain the code in comments (#). This is a work in progress, if you have comments or new ideas post them using the comment templates on a new line after the one you want to comment, I'm no expert in programming so input is appreciated :P.
Package dependencies
These packages are required for most of the code to work. Load these first.
- utils
- RODBC
Setup
Establishes the connection to the database. Copy paste first.
db <- odbcConnect("opasnet_base") #assuming "opasnet_base" is the defined DSN
Downloading data
- At the moment very sensitive to glitches in the database (multiple locations for the same index for a single cell), and does not handle large quantities of data well. Ideally should work, testing needed...
- The only page that I came across that does not give errors is Population of Europe, though probably because of the size of the data only the first hundred or so cells become populated, incorrectly...
- A lot of cells in data on many pages seem to be missing location data altogether...
- The code seems to break up in a different way every time I copy paste it to a new instance of R...
- This is related to the paste function used for the SQL queries and quotes in the query, escape characters don't seem to work... Investigating... Worked when queries were on a single line...
- Also related to Opasnet formatting, works fine copied from .txt file.
- This is related to the paste function used for the SQL queries and quotes in the query, escape characters don't seem to work... Investigating... Worked when queries were on a single line...
Low level functions (mainly sql queries)
Copy paste these second.
#Get latest series id op_baseGetLatest <- function(ident) { series_list <- sqlQuery(db, paste(' SELECT actobj.series_id FROM obj LEFT JOIN actobj ON obj.id = actobj.obj_id WHERE obj.ident = "', ident, '"', sep = '')) series_list[nrow(series_list),1] } #Get results (cell id, result) op_baseGetData <- function(series_id) { sqlQuery(db, paste(' SELECT cell.id, res.result FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id = res.cell_id WHERE actobj.series_id = ', series_id, sep = '')) } #Not in use: Get indexes and locations per cell op_baseGetDims <- function(cell_id_list) { sqlQuery(db, paste(' SELECT loccell.cell_id, loc.id, loc.location, loc.obj_id_i, obj.name FROM loccell LEFT JOIN loc ON loccell.loc_id = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = '')) } #Experimental: Sort locations by index #tapply(dims_table[,3], dims_table[,5], paste) #Not in use: Get indexes from dims_table op_baseFindInd <- function(dims_table) { factor(op_baseDims[,4]) #could also use "obj_id_i"(obj.name for index) instead of 5 for column name } #Get indexes op_baseGetInd <- function (cell_id_list) { indexes <- sqlQuery(db, paste(' SELECT DISTINCT loc.obj_id_i FROM loccell LEFT JOIN loc ON loccell.loc_id = loc.id WHERE loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = '')) as.numeric(indexes[,1]) } #Get locations op_baseGetLoc <- function(index, cell_id_list) { locations <- sqlQuery(db, paste(' SELECT DISTINCT loc.id FROM loccell LEFT JOIN loc ON loccell.loc_id = loc.id WHERE loc.obj_id_i = ', index, ' AND loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = '')) as.numeric(locations[,1]) } #Get location names (will not work properly if different indexes have locations with similar name) op_baseGetLocNames <- function(index, cell_id_list) { locations <- sqlQuery(db, paste(' SELECT DISTINCT loc.location FROM loccell LEFT JOIN loc ON loccell.loc_id = loc.id WHERE loc.obj_id_i = ', index, ' AND loccell.cell_id BETWEEN ', cell_id_list[1,1], ' AND ', cell_id_list[nrow(cell_id_list),1], sep = '')) as.numeric(locations[,1]) } #Get location name (simple) op_baseGetLocName <- function(loc_id) { sqlQuery(db, paste(' SELECT location FROM loc WHERE id = ', loc_id, sep = '')) } #Get index name op_baseGetIndName <- function(ind_id) { sqlQuery(db, paste(' SELECT name FROM obj WHERE id = ', ind_id, sep = '')) } #Get locations for a specific cell op_baseGetLocs <- function(cell_id) { sqlQuery(db, paste(' SELECT DISTINCT loc.location, loc.id FROM loccell LEFT JOIN loc ON loccell.loc_id = loc.id WHERE loccell.cell_id = ', cell_id, sep = '')) }
High Level function
Copy paste this last.
op_baseArray <- function(page_ident) { #Basic data downloading op_baseLatest <- op_baseGetLatest(page_ident) op_baseData <- op_baseGetData(op_baseLatest) #cell,result #final_array <- op_baseData[,2] #ideal case (data in proper order) #op_baseDims <- op_baseGetDims(op_baseData) #op_baseIndexes <- op_baseFindInd(op_baseDims) op_baseInd <- op_baseGetInd(op_baseData) #Array structure building IndLengths <- 1:3 #blank vector for index lengths IndList <- rep(vector("list", 1), length(op_baseInd)) #blank list of appropriate length for indexes and #locations for naming the dimensions of the array IndIdList <- rep(vector("list", 1), length(op_baseInd)) #blank list of appropriate length for ids of indexes IndNames <- c(1) #blank vector for index names #temp <- c(1) #temporary vector for (i in 1:length(op_baseInd)) { IndIdList[[i]] <- op_baseGetLoc(op_baseInd[i], op_baseData) #fetch location ids in index to a list, #assumed every cell has locations for all unique index id values in loc.obj_id_i in the range of cells IndNames[i] <- paste(op_baseGetIndName(op_baseInd[i])[1,1]) #fetch index name to a vector IndLengths[i] <- length(IndIdList[[i]]) #backup: op_baseGetLoc(op_baseInd[i], op_baseData)) for (j in 1:IndLengths[i]) { #temp[[i]][j] <- op_baseGetLocName(IndIdList[[i]][j]) IndList[[i]][j] <- paste(op_baseGetLocName(IndIdList[[i]][j])[1,1]) } #IndList[[i]] <- temp[[i]] } final_array <- rep(NA, prod(IndLengths)) dim(final_array) <- IndLengths names(IndList) <- IndNames dimnames(final_array) <- IndList #Cell population LocList <- c(1) #blank vector for locations per cell LocMap <- c(1) #blank vector for location to position mapping CellPosition <- c(1) #blank vector for cell position for (i in 1:length(op_baseData[,1])) { #for each cell LocList <- paste(op_baseGetLocs(op_baseData[i,1])[,1]) #get list of locations for (j in 1:length(LocList)) { #for each location per cell LocMap <- 1:IndLengths[j] #get available positions in given index, assumed correct order of indexes in LocList names(LocMap) <- dimnames(final_array)[[j]] #add names (locations) of positions in given index CellPosition[j] <- LocMap[LocList[j]] #match cell specific locations to positions in index } final_array[CellPosition] <- op_baseData[i,2] #insert result in array in position determined by CellPosition } final_array #print/return the final array }
Usage
variable <- op_baseArray("<page identifier>")