## INPUT: data.ori (the dataset; columns are variables, rows are iid samples), ncat is an
## array with the number of categories per variable (each variable var must have values in
## 1:ncat[var]), thr is a threshold for chisq.test, classcol is the column of the class
## variable, verb is a Boolean controlling verbose output
## OUTPUT: an SPN inspired by the fitting idea of Gens & Domingos 2013
## required packages: igraph (graph_from_adjacency_matrix, clusters) and cluster (clara)
library(igraph)
library(cluster)

spn.learn <- function(data.ori, ncat=NULL, thr=0.01, classcol=ncol(data.ori), verb=FALSE) {
    ## clean up the data and compute the number of categories per variable (unless already given)
    data <- matrix(0, nrow=nrow(data.ori), ncol=ncol(data.ori))
    ncat.new <- rep.int(0, ncol(data))
    for(i in 1:ncol(data)) {
        data[,i] <- as.integer(data.ori[,i])
        if(min(data[,i]) < 1) stop('data should be categorical and begin with 1')
        ncat.new[i] <- max(data[,i])
    }
    if(is.null(ncat)) ncat <- ncat.new
    ## in this implementation, the root node is a sum node over the class variable
    ## (a kind of discriminative approach)
    if(verb) cat(paste(Sys.time(), ":: Creating sum root\n"))
    scope <- 1:ncol(data)
    root <- list(children=list(), scope=scope, weight=c(), n=nrow(data), type='sum')
    j <- 1
    for(i in 1:ncat[classcol]) {
        members <- which(data[,classcol] == i)
        if(length(members) > 0) {
            ## our code for credal SPNs only works with sum nodes that have 2 children
            ## (except for this root node), so nclusters is forced to 2 here
            root$children[[j]] <- spn.learn.aux(data[members,,drop=FALSE], ncat, scope=scope,
                                                thr=thr, nclusters=2, verb=verb)
            root$weight <- c(root$weight, length(members))
            j <- j + 1
        }
    }
    return(list(root=root, ncat=ncat))
}
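## The following helper is NOT part of the original code: it is a minimal sketch, added only
## to illustrate the node-list structure returned by spn.learn (the name spn.eval.sketch and
## its signature are assumptions of this example). It evaluates the precise SPN at a single
## complete instantiation x, a vector with x[var] in 1:ncat[var].
spn.eval.sketch <- function(node, x) {
    if(node$type == 'leaf') {
        ## indicator leaf: 1 if the variable in the scope takes the stored value, 0 otherwise
        return(as.numeric(x[node$scope] == node$value))
    }
    vals <- sapply(node$children, spn.eval.sketch, x=x)
    if(node$type == 'sum') {
        ## sum node: convex combination of the children, using the stored counts as weights
        return(sum(node$weight * vals) / sum(node$weight))
    }
    ## product node: children have disjoint scopes, so their values are simply multiplied
    return(prod(vals))
}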
## learning is done recursively, splitting the data horizontally by clustering and
## vertically by "independence" tests
spn.learn.aux <- function(data, ncat, scope, thr, nclusters, verb, last.prod=FALSE) {
    n <- length(scope)
    m <- nrow(data)
    if(verb) cat(paste(Sys.time(), ":: New node with", m, "points and", n, "vars, scope",
                       paste(scope, collapse=' '), "\n"))
    if(n > 1) {
        ## a product node never has a product node as a child (easy to see), so the
        ## independence tests can be skipped right after a product node, just to speed things up
        if(!last.prod) {
            if(verb) cat(paste(Sys.time(), ":: Finding components\n"))
            ## build an undirected graph where two nodes (variables) are connected if they are dependent
            dep <- matrix(0, nrow=n, ncol=n)
            for(i in 1:(n-1)) {
                for(j in (i+1):n) {
                    if(m > 2 && length(unique(data[,scope[i]])) > 1 && length(unique(data[,scope[j]])) > 1) {
                        v <- suppressWarnings(chisq.test(data[,scope[i]], data[,scope[j]]))$p.value
                        if(v < thr) {
                            dep[j,i] <- 1
                            dep[i,j] <- 1
                        }
                    }
                }
            }
            g <- graph_from_adjacency_matrix(dep, mode='undirected')
            ## from that graph, take the connected components to form the children of a product node,
            ## as long as there is more than one component (a single component means we cannot split
            ## vertically at this moment, and in that case we move on to split horizontally)
            clu <- clusters(g)
            if(clu$no > 1) {
                if(verb) cat(paste(Sys.time(), ":: Found independence (", clu$no, "groups). Separating independent sets\n"))
                prodnode <- list(children=list(), scope=scope, n=m, type='prod')
                for(i in 1:clu$no) {
                    prodnode$children[[i]] <- spn.learn.aux(data, ncat, scope=scope[which(clu$membership == i)],
                                                            thr=thr, nclusters=nclusters, verb=verb, last.prod=TRUE)
                }
                return(prodnode)
            }
            if(verb) cat(paste(Sys.time(), ":: No independencies found\n"))
        }
    } else {
        ## single variable in the scope: if it has a single value in the data,
        ## place an indicator function (a leaf node)
        if(length(unique(data[,scope])) == 1) {
            if(verb) cat(paste(Sys.time(), ":: Creating new leaf\n"))
            return(list(scope=scope, value=data[1,scope], type='leaf'))
        }
        ## if multiple values of the variable are still present in the data, use a sum node
        ## with one indicator function per category as children (counts are smoothed by +1)
        nclusters <- ncat[scope]
        sumnode <- list(children=list(), scope=scope, weight=c(), n=m, type='sum')
        for(i in 1:nclusters) {
            members <- which(data[,scope] == i)
            sumnode$children[[i]] <- list(scope=scope, value=i, type='leaf')
            sumnode$weight <- c(sumnode$weight, length(members) + 1)
        }
        return(sumnode)
    }
    ## we were not able to cut vertically, so we cluster the data (each row is a point) and
    ## use the resulting partition to create the children of a sum node
    sumnode <- list(children=list(), scope=scope, weight=c(), n=m, type='sum')
    if(verb) cat(paste(Sys.time(), ":: Clara clustering with", nrow(data), "rows and", nclusters, "clusters\n"))
    if(nclusters >= nrow(data)) {
        clu.ind <- 1:nrow(data)
    } else {
        ## clara is very efficient as a clustering method, but not necessarily very accurate -
        ## can you live with that? :-)
        clu.ind <- clara(data[,scope,drop=FALSE], nclusters, samples=200, pamLike=TRUE)$clustering
    }
    j <- 1
    ## after clusters are found, build the children using that partition of the data
    for(i in 1:nclusters) {
        members <- which(clu.ind == i)
        if(length(members) > 0) {
            sumnode$children[[j]] <- spn.learn.aux(data[members,,drop=FALSE], ncat, scope=scope,
                                                   thr=thr, nclusters=nclusters, verb=verb, last.prod=FALSE)
            sumnode$weight <- c(sumnode$weight, length(members))
            j <- j + 1
        }
    }
    return(sumnode)
}
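## A usage sketch, also not from the original code: the dataset below is simulated purely for
## illustration (its size, the number of categories, and the seed are arbitrary assumptions).
## It learns an SPN with the class in the last column and evaluates the joint value of one
## complete instantiation with the illustrative helper spn.eval.sketch defined above.
set.seed(42)
toy <- cbind(sample(1:2, 200, replace=TRUE),   # binary feature
             sample(1:3, 200, replace=TRUE),   # three-valued feature
             sample(1:2, 200, replace=TRUE))   # binary class (classcol defaults to the last column)
spn <- spn.learn(toy, thr=0.01, verb=FALSE)
spn.eval.sketch(spn$root, c(1, 2, 1))          # joint value of the instantiation (1, 2, 1)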