본문 바로가기 사이드메뉴 바로가기 대메뉴 바로가기

목원대학교마케팅빅데이터학과


마케팅 실무형 빅데이터 전문 인력 양성 커뮤니티의 시대,

함께 성장하는 ‘같이’의 가치

강의게시판

PCA

작성자 이** 작성일 2025.04.08 조회수20

############################################################

## 주성분분석 Principal Component Analysis(PCA) ##

############################################################



## PCA and Dimension-Reduction



## Several functions from different packages are available

# in the R software for computing PCA:

# prcomp() [built-in R stats package],

# princomp() [built-in R stats package],

# PCA() [FactoMineR package],

# dudi.pca() [ade4 package],

# epPCA() [ExPosition package],

# principal() [psych package].



## FactoMineR: Multivariate Exploratory Data Analysis

# and Data Mining

# ade4: Analysis of Ecological Data: Exploratory and

# Euclidean Methods in Environmental Sciences

# ExPosition: Exploratory Analysis with the Singular

# Value Decomposition

# psych: Procedures for Psychological, Psychometric,

# and Personality Research

# factoextra: Extract and Visualize the Results of

# Multivariate Data Analyses



## No matter what function you decide to use,

# you can easily extract and visualize the results of PCA

# using R functions provided in the factoextra R package.



install.packages("FactoMineR"); library("FactoMineR")

install.packages("ade4"); library("ade4")

install.packages("ExPosition"); library("ExPosition")

install.packages("psych"); library("psych")

install.packages("factoextra"); library("factoextra")



###1 Example Data: Attitude

Attitude <- read.csv("중간관리자태도조사자료.csv")

head(Attitude)

Xdata <- Attitude[,2:13]

rownames(Xdata) <- as.character(Attitude[,1])

colnames(Xdata) <- as.character(Questions)

Questions <- Attitude[2:13,15]

head(Xdata)

str(Xdata)



X <- Xdata

for(i in 1:ncol(X)){

colnames(X)[i] <- strsplit(colnames(X)[i],split='=')[[1]][1]

}

X



###2 Perform PCA using the function PCA in 'FactoMineR' package



?PCA

# The function PCA performs Principal Component Analysis (PCA)

# with supplementary individuals,

# supplementary quantitative variables

# and supplementary categorical variables.

# Missing values are replaced by the column mean.



library("corrplot")

corrplot(cor(X),order="AOE")

?corrplot

# A graphical display of a correlation matrix, confidence

# interval. The details are paid great attention to. It can

# also visualize a general matrix by setting is.corr = FALSE.



PCA(X,

scale.unit=TRUE,

ncp=12,

graph=TRUE)

Questions



newX <- X

newX[,c(5,10,11,12)] <- 6-X[,c(5,10,11,12)]

colnames(newX)[c(5,10,11,12)] <- c("y5","y10","y11","y12")

newQuestions <- Questions

newQuestions[c(5,10,11,12)] <-

c("y5 ='내 직장 일은 지루하지 않다'",

"y10='나는 밑에 여러 직원을 두는 자리를 좋아한다'",

"y11='나는 이 직장에서 내가 해낸 일에 만족하지 않는다'",

"y12='보수가 더 좋은 직장이 있더라도 옮기지 않겠다'")

newX

newQuestions

str(newX)



corrplot(cor(newX),order="AOE")



PCA(newX,

scale.unit=TRUE,

ncp=12,

graph=TRUE)



PCA(newX,

scale.unit=TRUE,

ncp=11,

quali.sup=1,

ind.sup=13,

graph=TRUE)



## PCA terminology

# Active individuals: Individuals that are used in the PCA.

# Active variables: Variables that are used in the PCA.

# Supplementary Individuals (ind.sup):

# They dont participate to the PCA.

# The coordinates of these Individuals will be predicted.

# Supplementary quantitative variables (quanti.sup):

# Supplementary qualitative variables (quali.sup):

# This factor variables will be used to color individuals by groups.



res.pca <- PCA(newX,

scale.unit=TRUE,

ncp=11,

quali.sup=1,

ind.sup=13,

graph=TRUE)

res.pca

str(res.pca)





###3 Visualization and interpretation of PCA results



###3-1 Eigen-values / Variance / Scree-plot

eig.val <- get_eigenvalue(res.pca)

str(eig.val)

eig.val



fviz_eig(res.pca,

addlabels=TRUE,

ylim=c(0,50))



###3-2 PCA Results for variables

var <- get_pca_var(res.pca)

str(var)

var



var$coord # Coordinates for the Variables

var$cor # Correlation between variables and dimensions

var$cos2 # Quality for the variables on the factor map

var$contrib # Contributions of the variables to the PCs

apply(var$cos2,1,sum); apply(var$cos2[,1:2],1,sum)

apply(var$contrib,2,sum); apply(var$contrib[,1:2],2,sum)



fviz_pca_var(res.pca,

col.var="black")



corrplot(var$cos2, is.corr=FALSE)



# Total cos2 of variables on Dim.1 and Dim.2

fviz_cos2(res.pca,

choice="var",

axes=1:2)



fviz_pca_var(res.pca,

col.var="cos2",

gradient.cols=c("#00AFBB", "#E7B800", "#FC4E07"),

repel=TRUE)



# Change the transparency by cos2 values

fviz_pca_var(res.pca,

axes=c(1,2),

alpha.var="cos2")



corrplot(var$contrib, is.corr=FALSE)



# Contributions of variables to PC1

fviz_contrib(res.pca,

choice="var",

axes=1)

# Contributions of variables to PC2

fviz_contrib(res.pca,

choice="var",

axes=2)



# The total contribution to PC1 and PC2

fviz_contrib(res.pca,

choice="var",

axes=1:2,

top=11)



fviz_pca_var(res.pca,

col.var="contrib",

gradient.cols=c("#00AFBB", "#E7B800", "#FC4E07"))



# Create a grouping variable using kmeans

# Create 3 groups of variables (centers=3)

set.seed(123)

res.km <- kmeans(var$coord, centers=3, nstart=25)

grp <- as.factor(res.km$cluster)

# Color variables by groups

fviz_pca_var(res.pca,

col.var=grp,

palette=c("yellow", "blue", "green","pink"),

legend.title="Cluster")



res.desc <- dimdesc(res.pca,

axes=c(1:2),

proba=0.05)

res.desc$Dim.1 # Description of dimension 1

print(round(res.desc$Dim.1$quanti,digits=3))

res.desc$Dim.2 # Description of dimension 2

print(round(res.desc$Dim.2$quanti,digits=3))



###3-2 PCA Results for individuals

ind <- get_pca_ind(res.pca)

str(ind)

ind



ind$coord # Coordinates of individuals

ind$cos2 # Quality of individuals

ind$contrib # Contributions of individuals



fviz_pca_ind(res.pca)



fviz_pca_ind(res.pca,

col.ind="cos2",

gradient.cols=c("#00AFBB", "#E7B800", "#FC4E07"),

repel=TRUE)



fviz_pca_ind(res.pca,

pointsize="cos2",

pointshape=21,

fill="#E7B800",

repel=TRUE)



fviz_pca_ind(res.pca,

col.ind="cos2",

pointsize="cos2",

gradient.cols=c("#00AFBB", "#E7B800", "#FC4E07"),

repel=TRUE)





fviz_cos2(res.pca,

choice="ind",axes=1) # contribution on PC1

fviz_cos2(res.pca,

choice="ind",axes=2) # contribution on PC2



# Total contribution on PC1 and PC2

fviz_contrib(res.pca,

choice="ind",

axes=1:2)



# Create a random continuous variable of length 19,

# Same length as the number of active individuals in the PCA

set.seed(123)

my.cont.var <- rnorm(18)

# Color individuals by the continuous variable

fviz_pca_ind(res.pca,

col.ind=my.cont.var,

gradient.cols=c("blue", "yellow", "red"),

legend.title="Cont.Var")

fviz_pca_ind(res.pca,

col.ind=newX[-13,1],

gradient.cols=c("blue", "yellow", "red"),

legend.title="Cont.Var")



# Variables on dimensions 1 and 2

fviz_pca_var(res.pca,

axes=c(1,2))

# Individuals on dimensions 1 and 2

fviz_pca_ind(res.pca,

axes=c(1,2))



# Show variable points and text labels

fviz_pca_var(res.pca,

geom.var=c("point", "text"))



# Show individuals text labels only

fviz_pca_ind(res.pca,

geom.ind="text")



# Change the size of arrows an labels

fviz_pca_var(res.pca,

arrowsize=1,

labelsize=5,

repel=TRUE)



# Change points size, shape and fill color

# Change labelsize

fviz_pca_ind(res.pca,

pointsize=3,

pointshape=21,

fill="lightblue",

labelsize=5,

repel=TRUE)



fviz_pca_var(res.pca,

axes.linetype="blank")



fviz_pca_biplot(res.pca,

repel = TRUE,

col.var="#2E9FDF",

col.ind="#696969")





###4 Print/Save the PCA Results



## Print the plot to a pdf file

pdf("myplot.pdf")

print(myplot)

dev.off()



scree.plot <- fviz_eig(res.pca) # Scree plot

ind.plot <- fviz_pca_ind(res.pca) # Plot of individuals

var.plot <- fviz_pca_var(res.pca) # Plot of variables

pdf("PCA.pdf") # Create a new pdf device

print(scree.plot)

print(ind.plot)

print(var.plot)

dev.off() # Close the pdf device



png("pca-scree-plot.png")

print(scree.plot)

dev.off()



png("pca-variables.png")

print(var.plot)

dev.off()



png("pca-individuals.png")

print(ind.plot)

dev.off()



## 'ggpubr' provides some easy-to-use functions for creating

# and customizing 'ggplot2'- based publication ready plots.

library(ggpubr)

ggexport(plotlist=list(scree.plot, ind.plot, var.plot),

filename="PCA.pdf")



ggexport(plotlist=list(scree.plot, ind.plot, var.plot),

nrow=2,

ncol=2,

filename="PCA.pdf")



ggexport(plotlist=list(scree.plot, ind.plot, var.plot),

filename="PCA.png")





###참고: Perform Factorial Analysis from 'FactoMineR'

# with a Shiny Application

library(Factoshiny)

data(decathlon)

res.shiny <- PCAshiny(decathlon)







#####################################################

# Pre-Season 1985 College Basketball Rankings #

#####################################################

text <- "

Louisville 1 8 1 9 8 9 6 10 9 9

Georgia Tech 2 2 4 3 1 1 1 2 1 1

Kansas 3 4 5 1 5 11 8 4 5 7

Michigan 4 5 9 4 2 5 3 1 3 2

Duke 5 6 7 5 4 10 4 5 6 5

UNC 6 1 2 2 3 4 2 3 2 3

Syracuse 7 10 6 11 6 6 5 6 4 10

Notre Dame 8 14 15 13 11 20 18 13 12 .

Kentucky 9 15 16 14 14 19 11 12 11 13

LSU 10 9 13 . 13 15 16 9 14 8

DePaul 11 . 21 15 20 . 19 . . 19

Georgetown 12 7 8 6 9 2 9 8 8 4

Navy 13 20 23 10 18 13 15 . 20 .

Illinois 14 3 3 7 7 3 10 7 7 6

Iowa 15 16 . . 23 . . 14 . 20

Arkansas 16 . . . 25 . . . . 16

Memphis State 17 . 11 . 16 8 20 . 15 12

Washington 18 . . . . . . 17 . .

UAB 19 13 10 . 12 17 . 16 16 15

UNLV 20 18 18 19 22 . 14 18 18 .

NC State 21 17 14 16 15 . 12 15 17 18

Maryland 22 . . . 19 . . . 19 14

Pittsburgh 23 . . . . . . . . .

Oklahoma 24 19 17 17 17 12 17 . 13 17

Indiana 25 12 20 18 21 . . . . .

Virginia 26 . 22 . . 18 . . . .

Old Dominion 27 . . . . . . . . .

Auburn 28 11 12 8 10 7 7 11 10 11

St. Johns 29 . . . . 14 . . . .

UCLA 30 . . . . . . 19 . .

St. Joseph's . . 19 . . . . . . .

Tennessee . . 24 . . 16 . . . .

Montana . . . 20 . . . . . .

Houston . . . . 24 . . . . .

Virginia Tech . . . . . . 13 . . ."



data2 <- read.fwf(textConnection(text),

widths = c(13,rep(3,10)),

header = FALSE,

na.strings=" .",

skip=1)

Xdata <- data2[,-1]

rownames(Xdata) <- data2[,1]

colnames(Xdata) <- paste("x",sep="",1:10)

rm(data2)

News_names <- c("Community Sports News",

"Durham Sun",

"Durham Morning Herald",

"Washington Post",

"USA Today",

"Sports Magazine",

"Inside Sports",

"United Press International",

"Associated Press",

"Sports Illustrated")

Xdata

max_ranks <- apply(Xdata,2,max,na.rm=T)

imputed_ranks <- max_ranks + (36-max_ranks)/2

imputed_ranks[4] <- mean(c(12,21:35))

weights <- 10-apply(apply(Xdata,2,is.na),1,sum)

for(j in 1:10) {

for(i in 1:35) {

if(is.na(Xdata[i,j])) Xdata[i,j] <- imputed_ranks[j]

}

}



X <- Xdata

summary(X)



round(apply(X,2,mean),digits=2)

round(apply(X,2,var), digits=2)

Y <- scale(X,center=T,scale=F)



prcomp_out <- prcomp(Y)

str(prcomp_out)



princomp_out <- princomp(Y)

str(princomp_out)



out.PCA <- PCA(Y,

scale.unit=FALSE,

ncp=3,

row.w=weights,

graph=TRUE)

out.PCA

str(out.PCA)



get_eigenvalue(out.PCA)

fviz_eig(out.PCA,addlabels=TRUE,ylim=c(0,100))

fviz_pca_var(out.PCA)

fviz_pca_ind(out.PCA,repel=TRUE)

score <- sort(out.PCA$ind$coord[,1])

plot(score)







install.packages('psych')

library(psych)

help(package="psych")



cor.wt_X_out <- cor.wt(X,w=weights)

str(cor.wt_X_out)

out <- principal(cor.wt_X_out$r,nfactors=1)

str(out)

out$Vaccounted

str(out$loadings)

out_score_X <- predict(out, X)

str(out_score_X)

names(out_score_X) <- rownames(X)

out_score_X <- sort(out_score_X)

plot(out_score_X)