마케팅 실무형 빅데이터 전문 인력 양성 커뮤니티의 시대,
함께 성장하는 ‘같이’의 가치############################################################
## 주성분분석 Principal Component Analysis(PCA) ##
############################################################
## PCA and Dimension-Reduction
## Several functions from different packages are available
# in the R software for computing PCA:
# prcomp() [built-in R stats package],
# princomp() [built-in R stats package],
# PCA() [FactoMineR package],
# dudi.pca() [ade4 package],
# epPCA() [ExPosition package],
# principal() [psych package].
## FactoMineR: Multivariate Exploratory Data Analysis
# and Data Mining
# ade4: Analysis of Ecological Data: Exploratory and
# Euclidean Methods in Environmental Sciences
# ExPosition: Exploratory Analysis with the Singular
# Value Decomposition
# psych: Procedures for Psychological, Psychometric,
# and Personality Research
# factoextra: Extract and Visualize the Results of
# Multivariate Data Analyses
## No matter what function you decide to use,
# you can easily extract and visualize the results of PCA
# using R functions provided in the factoextra R package.
install.packages("FactoMineR"); library("FactoMineR")
install.packages("ade4"); library("ade4")
install.packages("ExPosition"); library("ExPosition")
install.packages("psych"); library("psych")
install.packages("factoextra"); library("factoextra")
###1 Example Data: Attitude
Attitude <- read.csv("중간관리자태도조사자료.csv")
head(Attitude)
Xdata <- Attitude[,2:13]
rownames(Xdata) <- as.character(Attitude[,1])
colnames(Xdata) <- as.character(Questions)
Questions <- Attitude[2:13,15]
head(Xdata)
str(Xdata)
X <- Xdata
for(i in 1:ncol(X)){
colnames(X)[i] <- strsplit(colnames(X)[i],split='=')[[1]][1]
}
X
###2 Perform PCA using the function PCA in 'FactoMineR' package
?PCA
# The function PCA performs Principal Component Analysis (PCA)
# with supplementary individuals,
# supplementary quantitative variables
# and supplementary categorical variables.
# Missing values are replaced by the column mean.
library("corrplot")
corrplot(cor(X),order="AOE")
?corrplot
# A graphical display of a correlation matrix, confidence
# interval. The details are paid great attention to. It can
# also visualize a general matrix by setting is.corr = FALSE.
PCA(X,
scale.unit=TRUE,
ncp=12,
graph=TRUE)
Questions
newX <- X
newX[,c(5,10,11,12)] <- 6-X[,c(5,10,11,12)]
colnames(newX)[c(5,10,11,12)] <- c("y5","y10","y11","y12")
newQuestions <- Questions
newQuestions[c(5,10,11,12)] <-
c("y5 ='내 직장 일은 지루하지 않다'",
"y10='나는 밑에 여러 직원을 두는 자리를 좋아한다'",
"y11='나는 이 직장에서 내가 해낸 일에 만족하지 않는다'",
"y12='보수가 더 좋은 직장이 있더라도 옮기지 않겠다'")
newX
newQuestions
str(newX)
corrplot(cor(newX),order="AOE")
PCA(newX,
scale.unit=TRUE,
ncp=12,
graph=TRUE)
PCA(newX,
scale.unit=TRUE,
ncp=11,
quali.sup=1,
ind.sup=13,
graph=TRUE)
## PCA terminology
# Active individuals: Individuals that are used in the PCA.
# Active variables: Variables that are used in the PCA.
# Supplementary Individuals (ind.sup):
# They dont participate to the PCA.
# The coordinates of these Individuals will be predicted.
# Supplementary quantitative variables (quanti.sup):
# Supplementary qualitative variables (quali.sup):
# This factor variables will be used to color individuals by groups.
res.pca <- PCA(newX,
scale.unit=TRUE,
ncp=11,
quali.sup=1,
ind.sup=13,
graph=TRUE)
res.pca
str(res.pca)
###3 Visualization and interpretation of PCA results
###3-1 Eigen-values / Variance / Scree-plot
eig.val <- get_eigenvalue(res.pca)
str(eig.val)
eig.val
fviz_eig(res.pca,
addlabels=TRUE,
ylim=c(0,50))
###3-2 PCA Results for variables
var <- get_pca_var(res.pca)
str(var)
var
var$coord # Coordinates for the Variables
var$cor # Correlation between variables and dimensions
var$cos2 # Quality for the variables on the factor map
var$contrib # Contributions of the variables to the PCs
apply(var$cos2,1,sum); apply(var$cos2[,1:2],1,sum)
apply(var$contrib,2,sum); apply(var$contrib[,1:2],2,sum)
fviz_pca_var(res.pca,
col.var="black")
corrplot(var$cos2, is.corr=FALSE)
# Total cos2 of variables on Dim.1 and Dim.2
fviz_cos2(res.pca,
choice="var",
axes=1:2)
fviz_pca_var(res.pca,
col.var="cos2",
gradient.cols=c("#00AFBB", "#E7B800", "#FC4E07"),
repel=TRUE)
# Change the transparency by cos2 values
fviz_pca_var(res.pca,
axes=c(1,2),
alpha.var="cos2")
corrplot(var$contrib, is.corr=FALSE)
# Contributions of variables to PC1
fviz_contrib(res.pca,
choice="var",
axes=1)
# Contributions of variables to PC2
fviz_contrib(res.pca,
choice="var",
axes=2)
# The total contribution to PC1 and PC2
fviz_contrib(res.pca,
choice="var",
axes=1:2,
top=11)
fviz_pca_var(res.pca,
col.var="contrib",
gradient.cols=c("#00AFBB", "#E7B800", "#FC4E07"))
# Create a grouping variable using kmeans
# Create 3 groups of variables (centers=3)
set.seed(123)
res.km <- kmeans(var$coord, centers=3, nstart=25)
grp <- as.factor(res.km$cluster)
# Color variables by groups
fviz_pca_var(res.pca,
col.var=grp,
palette=c("yellow", "blue", "green","pink"),
legend.title="Cluster")
res.desc <- dimdesc(res.pca,
axes=c(1:2),
proba=0.05)
res.desc$Dim.1 # Description of dimension 1
print(round(res.desc$Dim.1$quanti,digits=3))
res.desc$Dim.2 # Description of dimension 2
print(round(res.desc$Dim.2$quanti,digits=3))
###3-2 PCA Results for individuals
ind <- get_pca_ind(res.pca)
str(ind)
ind
ind$coord # Coordinates of individuals
ind$cos2 # Quality of individuals
ind$contrib # Contributions of individuals
fviz_pca_ind(res.pca)
fviz_pca_ind(res.pca,
col.ind="cos2",
gradient.cols=c("#00AFBB", "#E7B800", "#FC4E07"),
repel=TRUE)
fviz_pca_ind(res.pca,
pointsize="cos2",
pointshape=21,
fill="#E7B800",
repel=TRUE)
fviz_pca_ind(res.pca,
col.ind="cos2",
pointsize="cos2",
gradient.cols=c("#00AFBB", "#E7B800", "#FC4E07"),
repel=TRUE)
fviz_cos2(res.pca,
choice="ind",axes=1) # contribution on PC1
fviz_cos2(res.pca,
choice="ind",axes=2) # contribution on PC2
# Total contribution on PC1 and PC2
fviz_contrib(res.pca,
choice="ind",
axes=1:2)
# Create a random continuous variable of length 19,
# Same length as the number of active individuals in the PCA
set.seed(123)
my.cont.var <- rnorm(18)
# Color individuals by the continuous variable
fviz_pca_ind(res.pca,
col.ind=my.cont.var,
gradient.cols=c("blue", "yellow", "red"),
legend.title="Cont.Var")
fviz_pca_ind(res.pca,
col.ind=newX[-13,1],
gradient.cols=c("blue", "yellow", "red"),
legend.title="Cont.Var")
# Variables on dimensions 1 and 2
fviz_pca_var(res.pca,
axes=c(1,2))
# Individuals on dimensions 1 and 2
fviz_pca_ind(res.pca,
axes=c(1,2))
# Show variable points and text labels
fviz_pca_var(res.pca,
geom.var=c("point", "text"))
# Show individuals text labels only
fviz_pca_ind(res.pca,
geom.ind="text")
# Change the size of arrows an labels
fviz_pca_var(res.pca,
arrowsize=1,
labelsize=5,
repel=TRUE)
# Change points size, shape and fill color
# Change labelsize
fviz_pca_ind(res.pca,
pointsize=3,
pointshape=21,
fill="lightblue",
labelsize=5,
repel=TRUE)
fviz_pca_var(res.pca,
axes.linetype="blank")
fviz_pca_biplot(res.pca,
repel = TRUE,
col.var="#2E9FDF",
col.ind="#696969")
###4 Print/Save the PCA Results
## Print the plot to a pdf file
pdf("myplot.pdf")
print(myplot)
dev.off()
scree.plot <- fviz_eig(res.pca) # Scree plot
ind.plot <- fviz_pca_ind(res.pca) # Plot of individuals
var.plot <- fviz_pca_var(res.pca) # Plot of variables
pdf("PCA.pdf") # Create a new pdf device
print(scree.plot)
print(ind.plot)
print(var.plot)
dev.off() # Close the pdf device
png("pca-scree-plot.png")
print(scree.plot)
dev.off()
png("pca-variables.png")
print(var.plot)
dev.off()
png("pca-individuals.png")
print(ind.plot)
dev.off()
## 'ggpubr' provides some easy-to-use functions for creating
# and customizing 'ggplot2'- based publication ready plots.
library(ggpubr)
ggexport(plotlist=list(scree.plot, ind.plot, var.plot),
filename="PCA.pdf")
ggexport(plotlist=list(scree.plot, ind.plot, var.plot),
nrow=2,
ncol=2,
filename="PCA.pdf")
ggexport(plotlist=list(scree.plot, ind.plot, var.plot),
filename="PCA.png")
###참고: Perform Factorial Analysis from 'FactoMineR'
# with a Shiny Application
library(Factoshiny)
data(decathlon)
res.shiny <- PCAshiny(decathlon)
#####################################################
# Pre-Season 1985 College Basketball Rankings #
#####################################################
text <- "
Louisville 1 8 1 9 8 9 6 10 9 9
Georgia Tech 2 2 4 3 1 1 1 2 1 1
Kansas 3 4 5 1 5 11 8 4 5 7
Michigan 4 5 9 4 2 5 3 1 3 2
Duke 5 6 7 5 4 10 4 5 6 5
UNC 6 1 2 2 3 4 2 3 2 3
Syracuse 7 10 6 11 6 6 5 6 4 10
Notre Dame 8 14 15 13 11 20 18 13 12 .
Kentucky 9 15 16 14 14 19 11 12 11 13
LSU 10 9 13 . 13 15 16 9 14 8
DePaul 11 . 21 15 20 . 19 . . 19
Georgetown 12 7 8 6 9 2 9 8 8 4
Navy 13 20 23 10 18 13 15 . 20 .
Illinois 14 3 3 7 7 3 10 7 7 6
Iowa 15 16 . . 23 . . 14 . 20
Arkansas 16 . . . 25 . . . . 16
Memphis State 17 . 11 . 16 8 20 . 15 12
Washington 18 . . . . . . 17 . .
UAB 19 13 10 . 12 17 . 16 16 15
UNLV 20 18 18 19 22 . 14 18 18 .
NC State 21 17 14 16 15 . 12 15 17 18
Maryland 22 . . . 19 . . . 19 14
Pittsburgh 23 . . . . . . . . .
Oklahoma 24 19 17 17 17 12 17 . 13 17
Indiana 25 12 20 18 21 . . . . .
Virginia 26 . 22 . . 18 . . . .
Old Dominion 27 . . . . . . . . .
Auburn 28 11 12 8 10 7 7 11 10 11
St. Johns 29 . . . . 14 . . . .
UCLA 30 . . . . . . 19 . .
St. Joseph's . . 19 . . . . . . .
Tennessee . . 24 . . 16 . . . .
Montana . . . 20 . . . . . .
Houston . . . . 24 . . . . .
Virginia Tech . . . . . . 13 . . ."
data2 <- read.fwf(textConnection(text),
widths = c(13,rep(3,10)),
header = FALSE,
na.strings=" .",
skip=1)
Xdata <- data2[,-1]
rownames(Xdata) <- data2[,1]
colnames(Xdata) <- paste("x",sep="",1:10)
rm(data2)
News_names <- c("Community Sports News",
"Durham Sun",
"Durham Morning Herald",
"Washington Post",
"USA Today",
"Sports Magazine",
"Inside Sports",
"United Press International",
"Associated Press",
"Sports Illustrated")
Xdata
max_ranks <- apply(Xdata,2,max,na.rm=T)
imputed_ranks <- max_ranks + (36-max_ranks)/2
imputed_ranks[4] <- mean(c(12,21:35))
weights <- 10-apply(apply(Xdata,2,is.na),1,sum)
for(j in 1:10) {
for(i in 1:35) {
if(is.na(Xdata[i,j])) Xdata[i,j] <- imputed_ranks[j]
}
}
X <- Xdata
summary(X)
round(apply(X,2,mean),digits=2)
round(apply(X,2,var), digits=2)
Y <- scale(X,center=T,scale=F)
prcomp_out <- prcomp(Y)
str(prcomp_out)
princomp_out <- princomp(Y)
str(princomp_out)
out.PCA <- PCA(Y,
scale.unit=FALSE,
ncp=3,
row.w=weights,
graph=TRUE)
out.PCA
str(out.PCA)
get_eigenvalue(out.PCA)
fviz_eig(out.PCA,addlabels=TRUE,ylim=c(0,100))
fviz_pca_var(out.PCA)
fviz_pca_ind(out.PCA,repel=TRUE)
score <- sort(out.PCA$ind$coord[,1])
plot(score)
install.packages('psych')
library(psych)
help(package="psych")
cor.wt_X_out <- cor.wt(X,w=weights)
str(cor.wt_X_out)
out <- principal(cor.wt_X_out$r,nfactors=1)
str(out)
out$Vaccounted
str(out$loadings)
out_score_X <- predict(out, X)
str(out_score_X)
names(out_score_X) <- rownames(X)
out_score_X <- sort(out_score_X)
plot(out_score_X)