线性回归机器学习任务,作出λ参数与J_{train} 与J_{validation}的关系示意图

视频:

链接:https://pan.baidu.com/s/1ggE8q2J 密码:c97o

视频里面针对同一个因变量把因变量作为:

2分类(均匀分类)

6分类(均匀分类)

原始的连续数据

进行训练,数据来自FIFA-18 的数据库,自变量是:

potential
pac
sho
pas
dri
def
phy

因变量是周薪

2分类

 

 

两分类

6分类

 

六分类

原始数据——连续

连续

 

 

## Initialization
rm(list=ls())
setwd("C:\\Users\\57890\\Desktop\0.Bike Sharing\\")
sources <- c("linearRegCostFunction.R",
"trainLinearReg.R","mapFeature.R")

for (i in 1:length(sources)) {
cat(paste("Loading ",sources[i],"\n"))
source(sources[i])
}

## ------------- Part 1: Loading and Visualizing Data --------------

# Load Training Data
cat(sprintf('Loading and Visualizing Data ...\n'))

## data{train} and data{test}
str(data<-as.data.frame(data <- read.spss("C:\\Users\\57890\\Desktop\\tongji\\zhuqiu.sav")))

#N=1e5; idx <- sample(n<-nrow(data),N*2,replace = T)
n <- (nrow(data)+1)/2
data.tr <- data[1:n/2,]
data.va <- data[n/2+1:n,]

# training sets
X.tr <- data.tr[,c(9,15)]; y.tr <- scale(data.tr[,83])
X.tr <- scale(X.tr)
X.tr <- as.matrix(X.tr)
X.tr <- mapFeature(X.tr[,1],X.tr[,2])
# X.tr <- cbind(X.tr,data.tr[,7])

# test sets
X.va <- data.va[,c(9,15)]; y.va <- scale(data.va[,83])
X.va <- scale(X.va)
X.va <- as.matrix(X.va)
X.va <- mapFeature(X.va[,1],X.va[,2])
# X.va <- cbind(X.va,data.va[,7])

## ------------- Part 2: Regularized Linear Regression Cost --------------

theta <- rep(1,dim(X.va)[2])
(J <- linearRegCostFunction(X.tr, y.tr, 1)(theta))

## ------------- Part 3: Regularized Linear Regression Gradient --------------
theta <- rep(1,dim(X.va)[2])
(grad <- linearRegGradFunction(cbind(X.tr), y.tr, 1)(theta))
## ------------- Part 4: Train Linear Regression --------------

# parameters
theta <- rep(1,dim(X.va)[2])
lambdaset <- c(1:100,200,1000)

d = data.frame()
for (lambda in lambdaset) {
optim.tr <- trainLinearReg(X.tr, y.tr, lambda)
(J.tr <- optim.tr$value)
(theta <- optim.tr$par)
(J.va <- linearRegCostFunction(X.va, y.va, lambda)(theta))
d = rbind(d, data.frame(round(lambda,0), J.tr, J.va))
}
colnames(d)[1] <- "lambda"
colnames(d)[2] <- "J.tr"
colnames(d)[3] <- "J.va"
d

# visualization
#library(ggplot2)
#library(reshape2)
d2 <- melt(data = d, id = "lambda")
ggplot(d2,
aes(x= lambda, y= value, group = variable, colour = variable))+
geom_point()+
geom_line()+
labs(x = "lambda",
y = "J",
title = "Training & Validation Cost"
)

ggplot(d,aes(x = lambda, y = J.tr, colour = J.tr)) +
geom_point() +
geom_line() +
labs(x = "lambda",
y = "J{train}",
title = "Training Cost")

ggplot(d,aes(x = lambda, y = J.va, colour = J.va)) +
geom_point() +
geom_line() +
labs(x = "lambda",
y = "J{validation}",
title = "Validation Cost")

示范一组求gl效应的SPSSSYNTAX练习,与R的结果比对

视频:

链接:https://pan.baidu.com/s/1c3T73Fy 密码:8fd4

找到一个特别好用的函数)

linearHypothesis(model,hypothesis.matrix)

hypothesis.matrix可以直接把Lmatrix的内容复制出来,并且不需要单独列出不等于0的行和列,这个语句貌似还是R里面自带的。

library(Rcmdr)
Dataset <-
read.table("C:/Users/57890/Desktop/world-happiness-report/2015.csv",
header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
Dataset$e1 <- with(Dataset, binVariable(Health..Life.Expectancy., bins=2,
method='proportions', labels=c('t1','t2')))
Dataset$g1 <- with(Dataset, binVariable(Economy..GDP.per.Capita., bins=3,
method='proportions', labels=c('f1','f2','f3')))
attach(Dataset)
lm1<-lm(Happiness.Score~0+e1+g1)
Anova(lm1,type = 3)

#paired comparesion
library(multcomp)
lm2<-lm(Happiness.Score~0+e1:g1)
L=c(1,0,0,-1,0,0)
linearHypothesis(model = lm2,hypothesis.matrix =L)
ht2<-glht(lm2,c("e1t1:g1f1 - e1t2:g1f1== 0"))
confint(ht2)
#main effect
L1=c(1,-1,0,0,0,0)
L2=c(1,0,-1,0,0,0)
L3=c(0,1,-1,0,0,0)
linearHypothesis(model = lm2,hypothesis.matrix =L1)
linearHypothesis(model = lm2,hypothesis.matrix =L2)
linearHypothesis(model = lm2,hypothesis.matrix =L2)
ht3<-glht(lm2,c("e1t1:g1f1 - e1t1:g1f2== 0"))
ht4<-glht(lm2,c("e1t1:g1f2 - e1t1:g1f3== 0"))
ht5<-glht(lm2,c("e1t1:g1f1 - e1t1:g1f3== 0"))
confint(ht3)
confint(ht4)
confint(ht5)

常见的t检验Effect Size置信区间、样本图示、针对paired的简单斜率置信区间图和分布比较图的t-testplus代码

录屏使用数据:mtcars:drat  wt 做paired t-test

链接:https://pan.baidu.com/s/1hsw7LGS 密码:v5fg

内容:

t-testplus

t检验Effect Size置信区间

t检验ncp置信区间

样本图示

针对paired的简单斜率置信区间图

针对paired的分布比较图

代码如下

t.testplus<-function(x, y = NULL, alternative = c("two.sided", "less", "greater"),
mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95,
...){
origin<-function (x, y = NULL, alternative = c("two.sided", "less", "greater"),
mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95,
...)
{
alternative <- match.arg(alternative)
if (!missing(mu) && (length(mu) != 1 || is.na(mu)))
stop("'mu' must be a single number")
if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) ||
conf.level < 0 || conf.level > 1))
stop("'conf.level' must be a single number between 0 and 1")
if (!is.null(y)) {
dname <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))
if (paired)
xok <- yok <- complete.cases(x, y)
else {
yok <- !is.na(y)
xok <- !is.na(x)
}
y <- y[yok]
}
else {
dname <- deparse(substitute(x))
if (paired)
stop("'y' is missing for paired test")
xok <- !is.na(x)
yok <- NULL
}
x <- x[xok]
if (paired) {
x <- x - y
y <- NULL
}
nx <- length(x)
mx <- mean(x)
vx <- var(x)
if (is.null(y)) {
if (nx < 2)
stop("not enough 'x' observations")
df <- nx - 1
stderr <- sqrt(vx/nx)
if (stderr < 10 * .Machine$double.eps * abs(mx))
stop("data are essentially constant")
tstat <- (mx - mu)/stderr
method <- if (paired)
"Paired t-test"
else "One Sample t-test"
estimate <- setNames(mx, if (paired)
"mean of the differences"
else "mean of x")
}
else {
ny <- length(y)
if (nx < 1 || (!var.equal && nx < 2))
stop("not enough 'x' observations")
if (ny < 1 || (!var.equal && ny < 2))
stop("not enough 'y' observations")
if (var.equal && nx + ny < 3)
stop("not enough observations")
my <- mean(y)
vy <- var(y)
method <- paste(if (!var.equal)
"Welch", "Two Sample t-test")
estimate <- c(mx, my)
names(estimate) <- c("mean of x", "mean of y")
if (var.equal) {
df <- nx + ny - 2
v <- 0
if (nx > 1)
v <- v + (nx - 1) * vx
if (ny > 1)
v <- v + (ny - 1) * vy
v <- v/df
stderr <- sqrt(v * (1/nx + 1/ny))
}
else {
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny -
1))
}
if (stderr < 10 * .Machine$double.eps * max(abs(mx),
abs(my)))
stop("data are essentially constant")
tstat <- (mx - my - mu)/stderr
}
if (alternative == "less") {
pval <- pt(tstat, df)
cint <- c(-Inf, tstat + qt(conf.level, df))
}
else if (alternative == "greater") {
pval <- pt(tstat, df, lower.tail = FALSE)
cint <- c(tstat - qt(conf.level, df), Inf)
}
else {
pval <- 2 * pt(-abs(tstat), df)
alpha <- 1 - conf.level
cint <- qt(1 - alpha/2, df)
cint <- tstat + c(-cint, cint)
}
cint <- mu + cint * stderr
names(tstat) <- "t"
names(df) <- "df"
names(mu) <- if (paired || !is.null(y))
"difference in means"
else "mean"
attr(cint, "conf.level") <- conf.level
rval <- list(statistic = tstat, parameter = df, p.value = pval,
conf.int = cint, estimate = estimate, null.value = mu,
alternative = alternative, method = method, data.name = dname)
class(rval) <- "htest"
return(rval)
}

CI.ncp <- function(t, N){

f <- function (ncp, alpha, q, df) {
abs(suppressWarnings(pt(q = t, df = N - 1, ncp, lower.tail = FALSE)) - alpha) }

sapply(c(0.025, 0.975),
function(x) optim(1, f, alpha = x, q = t, df = N - 1, control = list(reltol = (.Machine$double.eps)))[[1]]) }

CI.ES<-function(ncp,df){
CIes<-ncp/(df^0.5)
}
tt<-origin(x, y, ...)
tt
CIncp<-CI.ncp(tt$statistic,tt$parameter+1)
CIES<-CI.ES(CIncp,tt$parameter)

names(CIncp)=c("CI of ncp:"," ")
names(CIES)=c("CI of ES:"," ")
show(list(tt,CIncp,CIES))
plot(x,y,main = "sample plot",... )

}

```
```{r}

boxplot(rep(1:100,2),ds$id)
alpha = c(0.05, 0.01, 0.001)
symbols = c('***', '**', '*', '')
which_symbol = as.numeric(cut(tt$p.value, c(0, rev(alpha), 1)))
ypos = max(d[rep(1, length(d)) == levels(rep(1, length(d)))[2]])
text(x = 2, y = ypos, symbols[which_symbol], cex = 1.5, adj = c(0.5, 0))
```

```{r}

t.testplus<-function(x, y = NULL, alternative = c("two.sided", "less", "greater"),
mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95,
...){
origin<-function (x, y = NULL, alternative = c("two.sided", "less", "greater"),
mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95,
...)
{
alternative <- match.arg(alternative)
if (!missing(mu) && (length(mu) != 1 || is.na(mu)))
stop("'mu' must be a single number")
if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) ||
conf.level < 0 || conf.level > 1))
stop("'conf.level' must be a single number between 0 and 1")
if (!is.null(y)) {
dname <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))
if (paired)
xok <- yok <- complete.cases(x, y)
else {
yok <- !is.na(y)
xok <- !is.na(x)
}
y <- y[yok]
}
else {
dname <- deparse(substitute(x))
if (paired)
stop("'y' is missing for paired test")
xok <- !is.na(x)
yok <- NULL
}
x <- x[xok]
if (paired) {
x <- x - y
y <- NULL
}
nx <- length(x)
mx <- mean(x)
vx <- var(x)
if (is.null(y)) {
if (nx < 2)
stop("not enough 'x' observations")
df <- nx - 1
stderr <- sqrt(vx/nx)
if (stderr < 10 * .Machine$double.eps * abs(mx))
stop("data are essentially constant")
tstat <- (mx - mu)/stderr
method <- if (paired)
"Paired t-test"
else "One Sample t-test"
estimate <- setNames(mx, if (paired)
"mean of the differences"
else "mean of x")
}
else {
ny <- length(y)
if (nx < 1 || (!var.equal && nx < 2))
stop("not enough 'x' observations")
if (ny < 1 || (!var.equal && ny < 2))
stop("not enough 'y' observations")
if (var.equal && nx + ny < 3)
stop("not enough observations")
my <- mean(y)
vy <- var(y)
method <- paste(if (!var.equal)
"Welch", "Two Sample t-test")
estimate <- c(mx, my)
names(estimate) <- c("mean of x", "mean of y")
if (var.equal) {
df <- nx + ny - 2
v <- 0
if (nx > 1)
v <- v + (nx - 1) * vx
if (ny > 1)
v <- v + (ny - 1) * vy
v <- v/df
stderr <- sqrt(v * (1/nx + 1/ny))
}
else {
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny -
1))
}
if (stderr < 10 * .Machine$double.eps * max(abs(mx),
abs(my)))
stop("data are essentially constant")
tstat <- (mx - my - mu)/stderr
}
if (alternative == "less") {
pval <- pt(tstat, df)
cint <- c(-Inf, tstat + qt(conf.level, df))
}
else if (alternative == "greater") {
pval <- pt(tstat, df, lower.tail = FALSE)
cint <- c(tstat - qt(conf.level, df), Inf)
}
else {
pval <- 2 * pt(-abs(tstat), df)
alpha <- 1 - conf.level
cint <- qt(1 - alpha/2, df)
cint <- tstat + c(-cint, cint)
}
cint <- mu + cint * stderr
names(tstat) <- "t"
names(df) <- "df"
names(mu) <- if (paired || !is.null(y))
"difference in means"
else "mean"
attr(cint, "conf.level") <- conf.level
rval <- list(statistic = tstat, parameter = df, p.value = pval,
conf.int = cint, estimate = estimate, null.value = mu,
alternative = alternative, method = method, data.name = dname)
class(rval) <- "htest"
return(rval)
}

CI.ncp <- function(t, N){

f <- function (ncp, alpha, q, df) {
abs(suppressWarnings(pt(q = t, df = N - 1, ncp, lower.tail = FALSE)) - alpha) }

sapply(c(0.025, 0.975),
function(x) optim(1, f, alpha = x, q = t, df = N - 1, control = list(reltol = (.Machine$double.eps)))[[1]]) }

CI.ES<-function(ncp,df){
CIes<-ncp/(df^0.5)
}
tt<-origin(x, y, ...)
tt
CIncp<-CI.ncp(tt$statistic,tt$parameter+1)
CIES<-CI.ES(CIncp,tt$parameter)
names(CIncp)=c("CI of ncp:"," ")
names(CIES)=c("CI of ES:"," ")
show(list(tt,CIncp,CIES))
layout(1:3)

if(paired) {
xz <- seq(from = min(x,y), to = max(x,y), by = (max(x,y)-min(x,y))/100)
y1 <- dnorm(xz, mean = mean(x), sd =sd(x))
y2 <- dnorm(xz, mean = mean(y), sd =sd(y))
plot(xz, y1, type="l", lwd=2, col="red",
main="distribution campare",
ylab = "Frequency", yaxt="n",xlab = NULL,
xlim = c(min(x,y), max(x,y)), ylim = c(0, 1))
lines(xz, y2)
polygon(c(min(x,y),xz, max(x,y)),c(0,y2,0), col="firebrick3",
border = "black")
polygon(c(min(x,y),xz, max(x,y)),c(0,y1,0), col="dodgerblue4",
border = "black")
ylab=c(seq(from=0, to= max(x,y), by= (max(x,y)-min(x,y)/10)))
yz=c(seq(from=0, to=0.05, length.out = 8))

text(x =min(x,y)+  ((max(x,y)-min(x,y))/10), y = 0.8, "x", col = "dodgerblue4", cex = 0.9)
text(x = min(x,y)+  ((max(x,y)-min(x,y))/10), y = 0.9, "y", col = "firebrick3", cex = 0.9)
}
plot(x,y,main = "sample plot",... )
if(paired) {
plot(y ~ x, pch = 16, col = 'red')

model1 <- lm(y ~ x)

pre <- predict(model1, interval = 'prediction')

data.pre <- data.frame(pre)

lines(data.pre$lwr ~ x, col = 'blue', lty = 2)

lines(data.pre$upr ~ x, col = 'blue', lty = 2)
}
}

 

R中编码实现,作出偏相关以及中介作用的主要结果Bootstrap的置信区间。

视频:链接:http://pan.baidu.com/s/1pLoj6o7 密码:og0g

源代码如下:数据针对L26课ex1

偏相关:

library(RcmdrMisc)
library(car)
Dataset<-readSPSS(choose.files())
str(Dataset)
LAST<-array(dim = 1e4)
for (k in 1:1e4) {
x<-sample(nrow(Dataset),10,replace = T)
Data2<-Dataset[x,]
model1<-lm(Data2$timing~Data2$carbor)
summary(model1)
summary1<-summary(model1)
resid1<-summary1$residuals

model2<-lm(Data2$points~Data2$carbor)
summary(model2)
summary2<-summary(model2)
resid2<-summary2$residuals

LAST[k]<-cor(resid1,resid2)

}

hist(LAST,probability = T)
lines(density(LAST),lty=2)
quantile(LAST,c(.025,.975))

 

中介:

LAST2<-array(dim = 1e4)
for (k in 1:1e4) {
x<-sample(nrow(Dataset),10,replace = T)
Data2<-Dataset[x,]

modelA<-cor(Data2$timing,Data2$carbor)
modelB<-cor(Data2$points,Data2$timing)
LAST2[k]<-modelA*modelB

}

hist(LAST2,probability = T)
lines(density(LAST2),lty=2)
quantile(LAST2,c(.025,.975))