knitr::opts_chunk$set(echo = TRUE)
There are 3 ways to test the data normality:
Using histogram, if it is bell-shaped then data are following a normal distribution.
Create a Q-Q plot (if the points in the plot roughly fall along a straight diagonal line, then the data is assumed to be normally distributed.
Formal Statistical Test using the Shapiro-Wilk Test: If the P-value is greater than 0.05 then the data is assumed to be normally distributed.
Skewness is the measure of symmetry for a distribution.
# Upload libraries
library(EnvStats) #for skewness
library(forecast) # for BoxCox function
library(bestNormalize)
# Read phenotypic data
MyTraits <- read.table("Phenotypic_data.tab", header=T, sep = "\t")
# Check Normality
shapiro.test(MyTraits$Harvest_mean); #NO
##
## Shapiro-Wilk normality test
##
## data: MyTraits$Harvest_mean
## W = 0.96152, p-value = 0.009787
# Check skweness
skewness(MyTraits$Harvest_mean)
## [1] NA
# Graphic representation of data distribution
par(mfrow=c(2,2))
hist(MyTraits$Harvest_mean,col='steelblue', main='Harvest, p-val= 0.009')
#qqplot
qqnorm(MyTraits$Harvest_mean, main='Harvest')
qqline(MyTraits$Harvest_mean)
#outliers before transformation
boxplot_HRD <- boxplot(MyTraits$Harvest_mean,col='steelblue')
outliers <- boxplot_HRD$out; outliers #No
# transform data with bestNormalize (use function orderNorm)
BN_obj <- bestNormalize(MyTraits$Harvest_mean)
T.HvD <- orderNorm(MyTraits$Harvest_mean);shapiro.test(T.HvD$x.t)
hist(T.HvD$x.t,col='steelblue', main='T.HvD, p-val= 1')
# Check Normality
shapiro.test(MyTraits$FW_mean); #NO
##
## Shapiro-Wilk normality test
##
## data: MyTraits$FW_mean
## W = 0.9612, p-value = 0.008784
# Graphic representation of data distribution
par(mfrow=c(2,2))
hist(MyTraits$FW_mean,col='steelblue', main='FW, p-val= 0.0087')
# qqplot
qqnorm(MyTraits$FW_mean, main='FW');qqline(MyTraits$FW_mean)
# skweness
skewness(MyTraits$FW_mean,na.rm = F); # -0.56:data is moderately skewed to the left
# outliers before transformation
boxplot.FW <- boxplot(MyTraits$FW_mean,col='steelblue')
outliers <- boxplot.FW$out; outliers #No
# transform and redo the Shapiro test and redraw the histogram
T.FW=(MyTraits$FW_mean)^2;shapiro.test(T.FW) #p-value = 0.12
hist(T.FW,col='steelblue', main='T.FW, p-val= 0.12')
# Check Normality
shapiro.test(MyTraits$SSC_mean); #P-value=0.88, normal
##
## Shapiro-Wilk normality test
##
## data: MyTraits$SSC_mean
## W = 0.99227, p-value = 0.8819
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$SSC_mean,col='steelblue', main='SSC, p-val= 0.88')
# qqplot
qqnorm(MyTraits$SSC_mean, main='SSC');qqline(MyTraits$SSC_mean)
# Skweness
skewness(MyTraits$SSC_mean)
## [1] -0.05693422
# Outliers
boxplot.SSC <- boxplot(MyTraits$SSC_mean,col='steelblue')
# Normality
shapiro.test(MyTraits$FF_mean); #YES
##
## Shapiro-Wilk normality test
##
## data: MyTraits$FF_mean
## W = 0.99104, p-value = 0.8043
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$FF_mean,col='steelblue', main='FF, p-val= 0.80')
# qqplot
qqnorm(MyTraits$FF_mean, main='FF');qqline(MyTraits$FF_mean)
# Skweness
skewness(MyTraits$FF_mean)
## [1] 0.06447945
# Outliers
boxplot.FF <- boxplot(MyTraits$FF_mean,col='steelblue')
outliers <- boxplot.FF$out; outliers #0
## numeric(0)
# Normality
shapiro.test(MyTraits$TA_mean); #NO
##
## Shapiro-Wilk normality test
##
## data: MyTraits$TA_mean
## W = 0.95313, p-value = 0.002626
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$TA_mean,col='steelblue',main='TA, p-val= 0.0026')
# qqplot
qqnorm(MyTraits$TA_mean, main='TA'); qqline(MyTraits$TA_mean)
# Skweness
skewness(MyTraits$TA_mean)
## [1] 0.6837849
# Outliers before transformation
boxplot.TA <- boxplot(MyTraits$TA_mean,col='steelblue')
outliers <- boxplot.TA$out; outliers #6 outliers were not removed as they were biologically relevant
## [1] 0.89 0.86 0.90 0.88 0.38 0.91
# transform and redo the Shapiro test and redraw the histogram
T.TA=log2(MyTraits$TA_mean)
shapiro.test(T.TA)
##
## Shapiro-Wilk normality test
##
## data: T.TA
## W = 0.97403, p-value = 0.06827
hist(T.TA,col='steelblue', main='T.TA, p-val= 0.068')
##
## Shapiro-Wilk normality test
##
## data: MyTraits$RI_mean
## W = 0.98643, p-value = 0.4771
## [1] 0.3642668
## [1] 38.12 35.81
##
## Shapiro-Wilk normality test
##
## data: MyTraits$VitC_mean
## W = 0.97257, p-value = 0.05375
## [1] 0.2873272
## [1] 3.43 27.79 2.85
##
## Shapiro-Wilk normality test
##
## data: MyTraits$Flavonoids_mean
## W = 0.93761, p-value = 0.0003108
## [1] 0.7599241
## [1] 66.38
## Best Normalizing transformation with 90 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 1.4444
## - Box-Cox: 1.5333
## - Center+scale: 1.6222
## - Double Reversed Log_b(x+a): 2.1181
## - Exp(x): 13.0222
## - Log_b(x+a): 1.4444
## - orderNorm (ORQ): 1.4111
## - sqrt(x + a): 1.3889
## - Yeo-Johnson: 1.5778
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized sqrt(x + a) Transformation with 90 nonmissing obs.:
## Relevant statistics:
## - a = 0
## - mean (before standardization) = 4.581693
## - sd (before standardization) = 1.51386
##
## Shapiro-Wilk normality test
##
## data: T.Flav$x.t
## W = 0.97952, p-value = 0.1671
# Normality
shapiro.test(MyTraits$Anthocyanins_mean); #no
##
## Shapiro-Wilk normality test
##
## data: MyTraits$Anthocyanins_mean
## W = 0.73121, p-value = 1.533e-11
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Anthocyanins_mean,col='steelblue', main='Anthocyanins, p-val= 1.53e-11')
#qqplot
qqnorm(MyTraits$Anthocyanins_mean, main='Anthocyanins'); qqline(MyTraits$Anthocyanins_mean)
#Skweness
skewness(MyTraits$Anthocyanins_mean) #2.4
## [1] 2.405975
#Outliers before transformation
boxplot.Anthocyanins <- boxplot(MyTraits$Anthocyanins_mean,col='steelblue')
outliers <- boxplot.Anthocyanins$out; outliers #6
## [1] 6.98 7.86 6.76 10.43 12.04 7.90
# Transformation
Antho.object <- bestNormalize(MyTraits$Anthocyanins_mean)
Box.Antho <- boxcox(MyTraits$Anthocyanins_mean)
T.Anthocyanin <- Box.Antho$x.t
shapiro.test(T.Anthocyanin) #0.12
##
## Shapiro-Wilk normality test
##
## data: T.Anthocyanin
## W = 0.97741, p-value = 0.1187
hist(T.Anthocyanin, col='steelblue', main='T.Anthocyanin, p-val= 0.12')
# Normality
shapiro.test(MyTraits$Phenolics_mean); #no
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Phenolics_mean,col='steelblue', main='Phenolics, p-val= 2.94e-09')
#qqplot
qqnorm(MyTraits$Phenolics_mean, main='Phenolics'); qqline(MyTraits$Phenolics_mean)
#Skweness
skewness(MyTraits$Phenolics_mean) #-2.11
#Outliers before transformation
boxplot.Phenolics <- boxplot(MyTraits$Phenolics_mean,col='steelblue')
outliers <- boxplot.Phenolics$out; outliers #6
#Transform with bestNormalize and redo the Shapiro test and redraw the histogram
Phenols.Object <- bestNormalize(MyTraits$Phenolics_mean)
Phenols.Object
YEO.Object <- yeojohnson(MyTraits$Phenolics_mean)
T.phenols <- YEO.Object$x.t
shapiro.test(T.phenols) # 0.265
hist(T.phenols, col='steelblue', main='T.phenols, p-val= 0.265')
## Best Normalizing transformation with 90 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 2.8444
## - Box-Cox: 1.8889
## - Center+scale: 2.3667
## - Double Reversed Log_b(x+a): 1.4333
## - Exp(x): 8.8667
## - Log_b(x+a): 2.8444
## - orderNorm (ORQ): 1.3333
## - sqrt(x + a): 2.5889
## - Yeo-Johnson: 1.2889
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized Yeo-Johnson Transformation with 90 nonmissing obs.:
## Estimated statistics:
## - lambda = 4.56977
## - mean (before standardization) = 9132977
## - sd (before standardization) = 3895710
##
## Shapiro-Wilk normality test
##
## data: MyTraits$RAC_mean
## W = 0.97681, p-value = 0.1075
## [1] -0.4879538
## numeric(0)
# Normality
shapiro.test(MyTraits$Sucrose_mean); #NO
##
## Shapiro-Wilk normality test
##
## data: MyTraits$Sucrose_mean
## W = 0.90193, p-value = 4.945e-06
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Sucrose_mean,col='steelblue', main='Sucrose, p-val= 4.9e-06')
#qqplot
qqnorm(MyTraits$Sucrose_mean, main='Sucrose'); qqline(MyTraits$Sucrose_mean)
#Skweness
skewness(MyTraits$Sucrose_mean)
## [1] -1.306714
#Outliers before transformation
boxplot.Sucrose <- boxplot(MyTraits$Sucrose_mean,col='steelblue')
outliers <- boxplot.Sucrose$out; outliers #6
## [1] 50.47 35.48 53.88 49.24 98.51 40.20
# Sucrose couldn't be transformed with any function
##
## Shapiro-Wilk normality test
##
## data: MyTraits$Glucose_mean
## W = 0.98192, p-value = 0.2449
## [1] 0.2676547
## numeric(0)
# Normality
shapiro.test(MyTraits$Fructose_mean); #No
##
## Shapiro-Wilk normality test
##
## data: MyTraits$Fructose_mean
## W = 0.94952, p-value = 0.001564
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Fructose_mean,col='steelblue',main='Fructose, p-val= 0.001')
#qqplot
qqnorm(MyTraits$Fructose_mean, main='Fructose'); qqline(MyTraits$Fructose_mean)
#Skweness
skewness(MyTraits$Fructose_mean)
## [1] -0.8691616
#Outliers before transformation
boxplot.Fructose <- boxplot(MyTraits$Fructose_mean,col='steelblue')
outliers <- boxplot.Fructose$out; outliers #2
## [1] 6.37 2.79
# Transform data
T.Fructose <- (MyTraits$Fructose_mean)^2
shapiro.test(T.Fructose) # 0.37
##
## Shapiro-Wilk normality test
##
## data: T.Fructose
## W = 0.9847, p-value = 0.3737
hist(T.Fructose, col='steelblue', main='T.Fructose, p-val= 0.37')
# Normality
shapiro.test(MyTraits$Sorbitol_mean); #No
##
## Shapiro-Wilk normality test
##
## data: MyTraits$Sorbitol_mean
## W = 0.94427, p-value = 0.0007539
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Sorbitol_mean,col='steelblue', main='Sorbitol, p-val= 0.0007')
#qqplot
qqnorm(MyTraits$Sorbitol_mean, main='Sorbitol'); qqline(MyTraits$Sorbitol_mean)
#Skweness
skewness(MyTraits$Sorbitol_mean)
## [1] 0.5914254
#outliers before transformation
boxplot.Sorbitol <- boxplot(MyTraits$Sorbitol_mean,col='steelblue')
outliers <- boxplot.Sorbitol$out; outliers #0
## numeric(0)
# Transform data
T.Sorbitol <- (MyTraits$Sorbitol_mean)^(1/3)
shapiro.test(T.Sorbitol) # 0.055
##
## Shapiro-Wilk normality test
##
## data: T.Sorbitol
## W = 0.97279, p-value = 0.05573
hist(T.Sorbitol, col='steelblue', main='T.Sorbitol, p-val= 0.055')
# Check Normality
shapiro.test(MyTraits$TS_mean); #No
##
## Shapiro-Wilk normality test
##
## data: MyTraits$TS_mean
## W = 0.96627, p-value = 0.01942
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$TS_mean,col='steelblue', main='TS, p-val= 0.02')
#qqplot
qqnorm(MyTraits$TS_mean, main='TS'); qqline(MyTraits$TS_mean)
#Skweness
skewness(MyTraits$TS_mean)
## [1] -0.5784034
#outliers before transformation
boxplot.TS <- boxplot(MyTraits$TS_mean,col='steelblue')
outliers <- boxplot.TS$out; outliers #2
## [1] 68.98 71.45
# Transform data
T.TS <- (MyTraits$TS_mean)^(2)
shapiro.test(T.TS) # 0.34
##
## Shapiro-Wilk normality test
##
## data: T.TS
## W = 0.98416, p-value = 0.3449
hist(T.TS, col='steelblue', main='T.TS, p-val= 0.34')