In survey research, our datasets nearly always comprise variables with mixed measurement levels - in particular, nominal, ordinal and continuous, or in R-speak, unordered factors, ordered factors and numeric variables. Sometimes it is useful to be able to do blanket tests of one set of variables (possibly of mixed level) against another without having to worry about which test to use.
For this we have developed an omni function which can do binary tests of significance between pairs of variables, either of which can be any of the three aforementioned levels. We have also generalised the function to include other kinds of variables such as lat/lon for GIS applications, and to distinguish between integer and continuous variables, but the version I am posting below sticks to just those three levels. Certainly one can argue about which tests are applicable in which precise case, but at least the principle might be interesting to my dear readeRs.
I will write another post soon about using this function in order to display heatmaps of significance levels.
The function returns the p value, together with attributes for the sample size and test used. It is also convenient to test if the two variables are literally the same variable. You can do this by providing your variables with an attribute “varnames”. So if attr(x,“varnames”) is the same as attr(y,“varnames”) then the function returns 1 (instead of 0, which would be the result if you hadn’t provided those attributes).
#some helper functions
classer=function(x){
y=class(x)[1]
s=switch(EXPR=y,"integer"="con","factor"="nom","character"="str","numeric"="con","ordered"="ord","logical"="log")
s
}
xc=function(stri,sepp=" ") (strsplit(stri, sepp)[[1]]) #so you can type
xc("red blue green") instead of c("red","blue","green")
#now comes the main function
xtabstat=function(v1=xy,v2,level1="nom",level2="nom",spv=F,...){
p=1
if(length(unique(v1))<2 | length(unique(v2))<2) p else {
havevarnames=!is.null(attr(v1,"varnames")) &
!is.null(attr(v2,"varnames"))
notsame=T; if
(havevarnames)notsame=attr(v1,"varnames")!=attr(v2,"varnames")
if(!havevarnames) warning(paste("If you don't provide varnames I can't
be sure the two variables are not
identical"),attr(v2,"label"),attr(v2,"label"))
if(notsame | !havevarnames){
if(min(length(which(table(v1)!=0)),length(which(table(v2)!=0)))>1) {
level1=classer(v1)
level2=classer(v2)
if(level1=="str") level1="nom"
if(level2=="str") level2="nom"
# if(attr(v1,"ncol")==2 & attr(v2,"ncol")==9)
if(level1 %in% xc("nom geo") & level2 %in% xc("nom geo"))
{if(class(try(chisq.test(v1,v2,...)))!="try-error"){
pp=chisq.test(v1,factor(v2),simulate.p.value=spv,...)
p=pp$p.value;attr(p,"method")="Chi-squared test"
attr(p,"estimate")=pp$statistic
}else p=1
}
else if(level1=="ord" & level2 %in% xc("nom geo"))
{if(class(try(kruskal.test(v1,factor(v2),...)))!="try-error"){
pp=kruskal.test(v1,factor(v2),...)
ppp<<-pp
p=pp$p.value
attr(p,"estimate")=pp$statistic
} else {
p=1
attr(p,"method")="Kruskal test"
}
}
else if(level1 %in% xc("nom geo") & level2=="ord")
{if(class(try(kruskal.test(v2,factor(v1),...)))!="try-error"){
pp=kruskal.test(v2,factor(v1),...)
p=pp$p.value;attr(p,"estimate")=pp$statistic
} else {
p=1
attr(p,"method")="Kruskal test"
}
}
else if((level1=="ord" & level2=="ord") | (level1=="ord" &
level2=="con") | (level1=="con" & level2=="ord"))
{if(class(try(cor.test(as.numeric(v1),as.numeric
(v2),method="spearman",...)))!="try-error")
{pp=cor.test(as.numeric(v1),as.numeric
(v2),method="spearman",...);p=pp$p.value;attr(p,"method")="Spearman
rho.";attr(p,"estimate")=pp$estimate} else cat("not enough finite
observations for Spearman")}
else if( level1=="con" & level2 %in% xc("nom geo")) {
if(class(try(anova(lm(as.numeric(v1)~v2))))!="try-error"){
pp=anova(lm(as.numeric(v1)~v2));p=pp$"Pr(>F)"[1];attr(p,"estimate")=pp["F
value"];attr(p,"method")="ANOVA F"
}else p=1}
else if( level1 %in% xc("nom geo") & level2 %in% xc("con")) {
if(class(try(anova(lm(as.numeric(v2)~v1))))!="try-error"){
pp=anova(lm(as.numeric(v2)~v1));p=pp$"Pr(>F)"[1];attr(p,"estimate")=pp["F
value"];attr(p,"method")="ANOVA F"
}else p=1}
else if( level1=="con" & level2 %in% xc("ord")) {
if(class(try(anova(lm(as.numeric(v1)~v2))))!="try-error"){
pp=anova(lm(as.numeric(v1)~v2));p=pp$"Pr(>F)"[1];attr(p,"estimate")=pp["F
value"];attr(p,"method")="ANOVA F"
}else p=1}
else if( level1=="ord" & level2 %in% xc("con")) {
if(class(try(anova(lm(as.numeric(v2)~v1))))!="try-error"){
pp=anova(lm(as.numeric(v2)~v1));p=pp$"Pr(>F)"[1];attr(p,"estimate")=pp["F
value"];attr(p,"method")="ANOVA F"
}else p=1}
##TODO think if these are the best tests
else if(level1=="con" & level2=="con")
{
# ;
pp=cor.test(as.numeric(v1),as.numeric(v2))
p=pp$p.value
attr(p,"method")="Pearson correlation"
attr(p,"estimate")=pp$estimate
}
# else if(level1=="str" | level2 =="str") stop(P("You are trying to
carry out stats tests for a string variable",attr(v1,"varnames")," or
",attr(v2,"varnames"),". You probably want to convert to nominal."))
else {p=1
attr(p,"estimate")=NULL
}
attr(p,"N")=nrow(na.omit(data.frame(v1,v2)))
}
} else {p=1;attr(p,"N")=sum(!is.na(v1))} #could put stuff here for
single-var analysis
if(is.na(p))p=1
p
}
}
## now let's try this out on a mixed dataset. Load mtcars and convert
some vars to ordinal and nominal.
mt=mtcars
mt$gear=factor(mt$gear,ordered=T)
mt$cyl=factor(mt$cyl,ordered=F)
s=sapply(mt,function(x){sapply(mt,function(y){
xtabstat(x,y)
})
}
)
heatmap(s)