knitr::opts_chunk$set(echo = TRUE)

0.0.1 Points to remember:

There are 3 ways to test the data normality:

  1. Using histogram, if it is bell-shaped then data are following a normal distribution.

  2. Create a Q-Q plot (if the points in the plot roughly fall along a straight diagonal line, then the data is assumed to be normally distributed.

  3. Formal Statistical Test using the Shapiro-Wilk Test: If the P-value is greater than 0.05 then the data is assumed to be normally distributed.


Skewness is the measure of symmetry for a distribution.

0.0.2 Harvest date (HvD): transformed with orderNorm function

# Upload libraries
library(EnvStats) #for skewness
library(forecast) # for BoxCox function
library(bestNormalize)

# Read phenotypic data
MyTraits <- read.table("Phenotypic_data.tab", header=T, sep = "\t")

# Check Normality 
shapiro.test(MyTraits$Harvest_mean); #NO
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$Harvest_mean
## W = 0.96152, p-value = 0.009787
# Check skweness
skewness(MyTraits$Harvest_mean)
## [1] NA
# Graphic representation of data distribution
par(mfrow=c(2,2))
hist(MyTraits$Harvest_mean,col='steelblue', main='Harvest, p-val= 0.009')

#qqplot
qqnorm(MyTraits$Harvest_mean, main='Harvest')
qqline(MyTraits$Harvest_mean)

#outliers before transformation
boxplot_HRD <- boxplot(MyTraits$Harvest_mean,col='steelblue')
outliers <- boxplot_HRD$out; outliers #No

# transform data with bestNormalize (use function orderNorm)
BN_obj <- bestNormalize(MyTraits$Harvest_mean)
T.HvD <- orderNorm(MyTraits$Harvest_mean);shapiro.test(T.HvD$x.t)
hist(T.HvD$x.t,col='steelblue', main='T.HvD, p-val= 1')

0.0.3 Fruit weight (FW): transformed with square function

# Check Normality
shapiro.test(MyTraits$FW_mean); #NO
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$FW_mean
## W = 0.9612, p-value = 0.008784
# Graphic representation of data distribution
par(mfrow=c(2,2))
hist(MyTraits$FW_mean,col='steelblue', main='FW, p-val= 0.0087')

# qqplot
qqnorm(MyTraits$FW_mean, main='FW');qqline(MyTraits$FW_mean)

# skweness
skewness(MyTraits$FW_mean,na.rm = F); # -0.56:data is moderately skewed to the left

# outliers before transformation
boxplot.FW <- boxplot(MyTraits$FW_mean,col='steelblue')
outliers <- boxplot.FW$out; outliers #No

# transform and redo the Shapiro test and redraw the histogram
T.FW=(MyTraits$FW_mean)^2;shapiro.test(T.FW) #p-value = 0.12
hist(T.FW,col='steelblue', main='T.FW, p-val= 0.12')

0.0.4 Soluble solids content (SSC):

# Check Normality
shapiro.test(MyTraits$SSC_mean); #P-value=0.88, normal 
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$SSC_mean
## W = 0.99227, p-value = 0.8819
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$SSC_mean,col='steelblue', main='SSC, p-val= 0.88')

# qqplot
qqnorm(MyTraits$SSC_mean, main='SSC');qqline(MyTraits$SSC_mean)

# Skweness
skewness(MyTraits$SSC_mean)
## [1] -0.05693422
# Outliers
boxplot.SSC <- boxplot(MyTraits$SSC_mean,col='steelblue')

0.0.5 Flesh firmness (FF):

# Normality
shapiro.test(MyTraits$FF_mean); #YES
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$FF_mean
## W = 0.99104, p-value = 0.8043
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$FF_mean,col='steelblue', main='FF, p-val= 0.80')

# qqplot
qqnorm(MyTraits$FF_mean, main='FF');qqline(MyTraits$FF_mean)

# Skweness
skewness(MyTraits$FF_mean)
## [1] 0.06447945
# Outliers
boxplot.FF <- boxplot(MyTraits$FF_mean,col='steelblue')
outliers <- boxplot.FF$out; outliers #0
## numeric(0)

0.0.6 Titratable acidity (TA): transformed with log2

# Normality
shapiro.test(MyTraits$TA_mean); #NO
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$TA_mean
## W = 0.95313, p-value = 0.002626
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$TA_mean,col='steelblue',main='TA, p-val= 0.0026')

# qqplot
qqnorm(MyTraits$TA_mean, main='TA'); qqline(MyTraits$TA_mean)

# Skweness
skewness(MyTraits$TA_mean)
## [1] 0.6837849
# Outliers before transformation
boxplot.TA <- boxplot(MyTraits$TA_mean,col='steelblue')
outliers <- boxplot.TA$out; outliers #6 outliers were not removed as they were biologically relevant
## [1] 0.89 0.86 0.90 0.88 0.38 0.91
# transform and redo the Shapiro test and redraw the histogram
T.TA=log2(MyTraits$TA_mean) 
shapiro.test(T.TA)
## 
##  Shapiro-Wilk normality test
## 
## data:  T.TA
## W = 0.97403, p-value = 0.06827
hist(T.TA,col='steelblue', main='T.TA, p-val= 0.068')

0.0.7 Ripening index (RI):

## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$RI_mean
## W = 0.98643, p-value = 0.4771
## [1] 0.3642668
## [1] 38.12 35.81

0.0.8 Vitamin C (VitC):

## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$VitC_mean
## W = 0.97257, p-value = 0.05375
## [1] 0.2873272
## [1]  3.43 27.79  2.85

0.0.9 Flavonoids (FLVs): Transformed with boxcox function

## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$Flavonoids_mean
## W = 0.93761, p-value = 0.0003108
## [1] 0.7599241
## [1] 66.38
## Best Normalizing transformation with 90 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 1.4444
##  - Box-Cox: 1.5333
##  - Center+scale: 1.6222
##  - Double Reversed Log_b(x+a): 2.1181
##  - Exp(x): 13.0222
##  - Log_b(x+a): 1.4444
##  - orderNorm (ORQ): 1.4111
##  - sqrt(x + a): 1.3889
##  - Yeo-Johnson: 1.5778
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized sqrt(x + a) Transformation with 90 nonmissing obs.:
##  Relevant statistics:
##  - a = 0 
##  - mean (before standardization) = 4.581693 
##  - sd (before standardization) = 1.51386
## 
##  Shapiro-Wilk normality test
## 
## data:  T.Flav$x.t
## W = 0.97952, p-value = 0.1671

0.0.10 Anthocyanins (ACNs): Transformed with boxcox function

# Normality
shapiro.test(MyTraits$Anthocyanins_mean); #no
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$Anthocyanins_mean
## W = 0.73121, p-value = 1.533e-11
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Anthocyanins_mean,col='steelblue', main='Anthocyanins, p-val= 1.53e-11')

#qqplot
qqnorm(MyTraits$Anthocyanins_mean, main='Anthocyanins'); qqline(MyTraits$Anthocyanins_mean)

#Skweness
skewness(MyTraits$Anthocyanins_mean) #2.4
## [1] 2.405975
#Outliers before transformation
boxplot.Anthocyanins <- boxplot(MyTraits$Anthocyanins_mean,col='steelblue')
outliers <- boxplot.Anthocyanins$out; outliers #6
## [1]  6.98  7.86  6.76 10.43 12.04  7.90
# Transformation
Antho.object <- bestNormalize(MyTraits$Anthocyanins_mean)
Box.Antho <- boxcox(MyTraits$Anthocyanins_mean)
T.Anthocyanin <- Box.Antho$x.t

shapiro.test(T.Anthocyanin) #0.12
## 
##  Shapiro-Wilk normality test
## 
## data:  T.Anthocyanin
## W = 0.97741, p-value = 0.1187
hist(T.Anthocyanin, col='steelblue', main='T.Anthocyanin, p-val= 0.12')

0.0.11 Phenolics (Phen): Transformed with yeojohnson function

# Normality
shapiro.test(MyTraits$Phenolics_mean); #no

# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Phenolics_mean,col='steelblue', main='Phenolics, p-val= 2.94e-09')

#qqplot
qqnorm(MyTraits$Phenolics_mean, main='Phenolics'); qqline(MyTraits$Phenolics_mean)

#Skweness
skewness(MyTraits$Phenolics_mean) #-2.11

#Outliers before transformation
boxplot.Phenolics <- boxplot(MyTraits$Phenolics_mean,col='steelblue')
outliers <- boxplot.Phenolics$out; outliers #6

#Transform with bestNormalize and redo the Shapiro test and redraw the histogram
Phenols.Object <- bestNormalize(MyTraits$Phenolics_mean)
Phenols.Object
YEO.Object <- yeojohnson(MyTraits$Phenolics_mean)
T.phenols <- YEO.Object$x.t
shapiro.test(T.phenols) # 0.265
hist(T.phenols, col='steelblue', main='T.phenols, p-val= 0.265')
## Best Normalizing transformation with 90 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 2.8444
##  - Box-Cox: 1.8889
##  - Center+scale: 2.3667
##  - Double Reversed Log_b(x+a): 1.4333
##  - Exp(x): 8.8667
##  - Log_b(x+a): 2.8444
##  - orderNorm (ORQ): 1.3333
##  - sqrt(x + a): 2.5889
##  - Yeo-Johnson: 1.2889
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized Yeo-Johnson Transformation with 90 nonmissing obs.:
##  Estimated statistics:
##  - lambda = 4.56977 
##  - mean (before standardization) = 9132977 
##  - sd (before standardization) = 3895710

0.0.12 Relative antioxidant capacity (RAC):

## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$RAC_mean
## W = 0.97681, p-value = 0.1075
## [1] -0.4879538
## numeric(0)

0.0.13 Sucrose (Suc):

# Normality
shapiro.test(MyTraits$Sucrose_mean); #NO
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$Sucrose_mean
## W = 0.90193, p-value = 4.945e-06
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Sucrose_mean,col='steelblue',  main='Sucrose, p-val= 4.9e-06')

#qqplot
qqnorm(MyTraits$Sucrose_mean, main='Sucrose'); qqline(MyTraits$Sucrose_mean)

#Skweness
skewness(MyTraits$Sucrose_mean)
## [1] -1.306714
#Outliers before transformation
boxplot.Sucrose <- boxplot(MyTraits$Sucrose_mean,col='steelblue')
outliers <- boxplot.Sucrose$out; outliers #6
## [1] 50.47 35.48 53.88 49.24 98.51 40.20
# Sucrose couldn't be transformed with any function

0.0.14 Glucose (Glu):

## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$Glucose_mean
## W = 0.98192, p-value = 0.2449
## [1] 0.2676547
## numeric(0)

0.0.15 Fructose (Fruc): Transformed with square function

# Normality
shapiro.test(MyTraits$Fructose_mean); #No
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$Fructose_mean
## W = 0.94952, p-value = 0.001564
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Fructose_mean,col='steelblue',main='Fructose, p-val= 0.001')

#qqplot
qqnorm(MyTraits$Fructose_mean, main='Fructose'); qqline(MyTraits$Fructose_mean)

#Skweness
skewness(MyTraits$Fructose_mean)
## [1] -0.8691616
#Outliers before transformation
boxplot.Fructose <- boxplot(MyTraits$Fructose_mean,col='steelblue')
outliers <- boxplot.Fructose$out; outliers #2
## [1] 6.37 2.79
# Transform data
T.Fructose <- (MyTraits$Fructose_mean)^2
shapiro.test(T.Fructose) # 0.37
## 
##  Shapiro-Wilk normality test
## 
## data:  T.Fructose
## W = 0.9847, p-value = 0.3737
hist(T.Fructose, col='steelblue', main='T.Fructose, p-val= 0.37')

0.0.16 Sorbitol (Sor): Transformed with cube root

# Normality
shapiro.test(MyTraits$Sorbitol_mean); #No
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$Sorbitol_mean
## W = 0.94427, p-value = 0.0007539
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$Sorbitol_mean,col='steelblue', main='Sorbitol, p-val= 0.0007')

#qqplot
qqnorm(MyTraits$Sorbitol_mean, main='Sorbitol'); qqline(MyTraits$Sorbitol_mean)

#Skweness
skewness(MyTraits$Sorbitol_mean)
## [1] 0.5914254
#outliers before transformation
boxplot.Sorbitol <- boxplot(MyTraits$Sorbitol_mean,col='steelblue')
outliers <- boxplot.Sorbitol$out; outliers #0
## numeric(0)
# Transform data
T.Sorbitol <- (MyTraits$Sorbitol_mean)^(1/3)
shapiro.test(T.Sorbitol) # 0.055
## 
##  Shapiro-Wilk normality test
## 
## data:  T.Sorbitol
## W = 0.97279, p-value = 0.05573
hist(T.Sorbitol, col='steelblue', main='T.Sorbitol, p-val= 0.055')

0.0.17 Total sugar (TS):Transformed with square function

# Check Normality
shapiro.test(MyTraits$TS_mean); #No
## 
##  Shapiro-Wilk normality test
## 
## data:  MyTraits$TS_mean
## W = 0.96627, p-value = 0.01942
# Histogram
par(mfrow=c(2,2))
hist(MyTraits$TS_mean,col='steelblue', main='TS, p-val= 0.02')

#qqplot
qqnorm(MyTraits$TS_mean, main='TS'); qqline(MyTraits$TS_mean)

#Skweness
skewness(MyTraits$TS_mean)
## [1] -0.5784034
#outliers before transformation
boxplot.TS <- boxplot(MyTraits$TS_mean,col='steelblue')
outliers <- boxplot.TS$out; outliers #2
## [1] 68.98 71.45
# Transform data
T.TS <- (MyTraits$TS_mean)^(2)
shapiro.test(T.TS) # 0.34
## 
##  Shapiro-Wilk normality test
## 
## data:  T.TS
## W = 0.98416, p-value = 0.3449
hist(T.TS, col='steelblue', main='T.TS, p-val= 0.34')