Commit e3a837e7 authored by Nathan Rebiscoul's avatar Nathan Rebiscoul
Browse files

add journal things must reorganise things

parent 0e97d5cf
parcelle;variete;phyto;rdt
1;V1;Avec;5652
2;V1;Avec;5583
3;V1;Avec;5612
4;V1;Avec;5735
5;V1;Avec;5704
6;V1;Avec;5544
7;V1;Avec;5563
8;V1;Avec;5610
9;V1;Avec;5641
10;V1;Avec;5637
11;V1;Sans;5581
12;V1;Sans;5808
13;V1;Sans;5582
14;V1;Sans;5528
15;V1;Sans;5754
16;V1;Sans;5676
17;V1;Sans;5558
18;V1;Sans;5724
19;V1;Sans;5619
20;V1;Sans;5565
21;V2;Avec;5458
22;V2;Avec;5591
23;V2;Avec;5501
24;V2;Avec;5714
25;V2;Avec;5708
26;V2;Avec;5731
27;V2;Avec;5691
28;V2;Avec;5571
29;V2;Avec;5613
30;V2;Avec;5359
31;V2;Sans;5744
32;V2;Sans;5771
33;V2;Sans;5592
34;V2;Sans;5499
35;V2;Sans;5437
36;V2;Sans;5413
37;V2;Sans;5581
38;V2;Sans;5268
39;V2;Sans;5526
40;V2;Sans;5914
41;V3;Avec;5330
42;V3;Avec;5485
43;V3;Avec;5712
44;V3;Avec;5499
45;V3;Avec;5461
46;V3;Avec;5444
47;V3;Avec;5412
48;V3;Avec;5396
49;V3;Avec;5466
50;V3;Avec;5400
51;V3;Sans;5431
52;V3;Sans;5599
53;V3;Sans;5455
54;V3;Sans;5435
55;V3;Sans;5371
56;V3;Sans;5499
57;V3;Sans;5623
58;V3;Sans;5367
59;V3;Sans;5432
60;V3;Sans;5475
61;V4;Avec;5844
62;V4;Avec;5713
63;V4;Avec;5841
64;V4;Avec;5716
65;V4;Avec;5803
66;V4;Avec;5725
67;V4;Avec;5881
68;V4;Avec;5672
69;V4;Avec;5869
70;V4;Avec;5602
71;V4;Sans;5766
72;V4;Sans;5408
73;V4;Sans;5947
74;V4;Sans;5644
75;V4;Sans;5848
76;V4;Sans;5809
77;V4;Sans;5627
78;V4;Sans;5881
79;V4;Sans;5649
80;V4;Sans;5799
......@@ -226,7 +226,508 @@ quality of data)
- look at the flipboo
* Course 6
** Parallel quicksort and Fitts' law exercises feedback
- Look at the timeline (frise) that you drew.
- What could go wrong and have an impact?
** Confidence interval
*** Experiment
#+begin_src R :results output :session *R* :exports both
# Central-limit-theorem experiment: draw P groups of N uniform samples on
# [0, 2] and plot the distribution of the per-group means.
library(dplyr)
library(tidyverse)
P <- 35  # number of groups (one mean per group)
N <- 20  # number of samples drawn in each group
df <- data.frame(
  val = runif(n = N * P, min = 0, max = 2),
  # Label the rows 1..P cyclically so that every group receives N values
  # (`1-P` was a subtraction that produced one constant label).
  group = rep(seq_len(P), times = N)
)
df_agg <- df %>%
  group_by(group) %>%
  summarise(mean_val = mean(val))
# ggplot layers are combined with `+`, and `xlim()` needs both bounds.
df_agg %>%
  ggplot() +
  geom_histogram(aes(x = mean_val), binwidth = 0.005) +
  theme_bw() +
  xlim(0, 2)
#+end_src
#+RESULTS:
:
: Error in df_agg %>% ggplot() * geom_histogram(aes(x = mean_val), binwidth = 0.005) :
: non-numeric argument to binary operator
- Looks (approximately) like a Gaussian
- The more measurements we take, the more the values concentrate and the closer their distribution gets to a normal distribution
*** Slides
- The central limit theorem says that, with an arbitrary distribution, as n
grows the mean of n samples looks more and more like a Gaussian.
- Plot the data to quickly see whether they are independent/random or not
* Course 7
** Linear Regression
- Allows to explain a quantitative variable from quantitative
explanatory variables
- Supervised classification -> explain qualitative from quantitative
explanatory variable
- Analysis of variance -> explain influence of one or two qualitative
or quantitative
- We must assume that there is a linear relationship between variables
*** A few examples
- We can minimize some criterion to summarize the data
- *eyeball, least distance, least square, least rectangle surface*...
*** TP : Linear regression on trees
*** 1 Simple Regression
Data set on github course (do not follow the link on the paper)
#+begin_src R :results output :session *R* :exports both
# Simple linear regression of tree height on circumference.
library(dplyr)
library(tidyverse)

# The first three lines of the file are skipped before the header row.
myData <- read.table(
  file = "arbres-tot.csv", sep = ";", skip = 3, header = TRUE
)
# Keep only the rows whose X10 value is non-zero.
myData <- myData[myData$X10 != 0, ]
circ <- myData$X70
height <- myData$X10

# Scatter plot of height against circumference.
ggplot(myData, aes(x = circ, y = height)) +
  geom_point() +
  labs(x = "circ", y = "height")

# Fit the model, then list its components, its ANOVA table and its summary.
simple_reg <- lm(height ~ circ, data = myData)
names(simple_reg)
anova(simple_reg)
summary(simple_reg)
#+end_src
#+RESULTS:
#+begin_example
[1] "coefficients" "residuals" "effects" "rank"
[5] "fitted.values" "assign" "qr" "df.residual"
[9] "xlevels" "call" "terms" "model"
Analysis of Variance Table
Response: height
Df Sum Sq Mean Sq F value Pr(>F)
circ 1 1508.1 1508.13 197.57 < 2.2e-16 ***
Residuals 148 1129.8 7.63
---
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Call:
lm(formula = height ~ circ, data = myData)
Residuals:
Min 1Q Median 3Q Max
-9.2321 -1.6180 -0.2804 1.1280 9.0187
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.679057 0.455838 5.877 2.66e-08 ***
circ 0.090032 0.006405 14.056 < 2e-16 ***
---
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.763 on 148 degrees of freedom
Multiple R-squared: 0.5717, Adjusted R-squared: 0.5688
F-statistic: 197.6 on 1 and 148 DF, p-value: < 2.2e-16
`geom_smooth()` using formula 'y ~ x'
#+end_example
*All plot*
#+begin_src R :results output :session *R* :exports both
# Scatter plot of the trees with the fitted least-squares line drawn on top
# (no confidence band).
ggplot(myData, aes(x = circ, y = height)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "circ", y = "height")
#+end_src
#+begin_src R :results output :session *R* :exports both
# Residual diagnostics: autocorrelation of the residuals, then the second
# of the standard lm diagnostic plots.
acf(residuals(simple_reg))
plot(simple_reg, which = 2)
#+end_src
#+begin_src R :results output :session *R* :exports both
# Plot the residuals in observation order. The accessor is used instead of
# `simple_reg$residual`, which only worked through `$` partial matching of
# the `residuals` component.
plot(residuals(simple_reg), ylab = "Residuals")
#+end_src
#+begin_src R :results output :session *R* :exports both
# Third standard lm diagnostic plot.
plot(simple_reg, which = 3)
#+end_src
#+RESULTS:
#+begin_src R :results output :session *R* :exports both
# First standard lm diagnostic plot.
plot(simple_reg, which = 1)
#+end_src
#+begin_src R :results output :session *R* :exports both
# Fourth standard lm diagnostic plot.
plot(simple_reg, which = 4)
#+end_src
*** 2 Multivariate regression
#+begin_src R :results output :session *R* :exports both
# Regression of height on the square root of the circumference.
myData$circ_sqrt <- sqrt(myData$X70)

# Model with both the raw and the square-root circumference terms.
multi_reg <- lm(height ~ circ + circ_sqrt, data = myData)
summary(multi_reg)

# Model with the square-root term only.
multi_reg_2 <- lm(height ~ circ_sqrt, data = myData)
summary(multi_reg_2)

# Evaluate the fitted square-root curve on a regular grid of circumferences.
# `coef()` and `length.out` are spelled out instead of relying on partial
# matching (`$coefficient`, `len`).
circ_pred <- seq(0, 175, length.out = 1000)
reg_coef <- coef(multi_reg_2)
height_pred <- reg_coef[[1]] + reg_coef[[2]] * sqrt(circ_pred)
fct_reg <- data.frame(circ_pred = circ_pred, height_pred = height_pred)

ggplot() +
  geom_point(data = myData, aes(x = circ, y = height)) +
  geom_line(
    data = fct_reg, aes(x = circ_pred, y = height_pred),
    colour = "blue"
  ) +
  # The plot has no global data or mapping, so the smoother is given its
  # own; without them it has nothing to fit the straight line on.
  stat_smooth(
    mapping = aes(x = circ, y = height), data = myData,
    method = "lm", se = FALSE
  ) +
  xlab("circ") +
  ylab("height")
#+end_src
#+RESULTS:
#+begin_example
Call:
lm(formula = height ~ circ + circ_sqrt, data = myData)
Residuals:
Min 1Q Median 3Q Max
-9.4182 -1.5795 -0.0383 0.9617 8.4205
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.96947 2.05237 -1.934 0.05502 .
circ -0.02947 0.03656 -0.806 0.42149
circ_sqrt 1.86596 0.56255 3.317 0.00115 **
---
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.674 on 147 degrees of freedom
Multiple R-squared: 0.6015, Adjusted R-squared: 0.5961
F-statistic: 111 on 2 and 147 DF, p-value: < 2.2e-16
Call:
lm(formula = height ~ circ_sqrt, data = myData)
Residuals:
Min 1Q Median 3Q Max
-9.4017 -1.5133 -0.0729 1.0343 8.5568
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.42957 0.74930 -3.242 0.00146 **
circ_sqrt 1.41906 0.09528 14.893 < 2e-16 ***
---
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.671 on 148 degrees of freedom
Multiple R-squared: 0.5998, Adjusted R-squared: 0.5971
F-statistic: 221.8 on 1 and 148 DF, p-value: < 2.2e-16
#+end_example
* Course 8
** Anova
*** TP : Anova analysis on wheat
*1.Import the data*
#+begin_src R :results output :session *R* :exports both
# Load the wheat data (semicolon-separated, "." as decimal mark) and print
# the whole table.
library(ggplot2)
ble <- read.table(file = "ble.txt", sep = ";", dec = ".", header = TRUE)
print(ble)
#+end_src
#+RESULTS:
#+begin_example
parcelle variete phyto rdt
1 1 V1 Avec 5652
2 2 V1 Avec 5583
3 3 V1 Avec 5612
4 4 V1 Avec 5735
5 5 V1 Avec 5704
6 6 V1 Avec 5544
7 7 V1 Avec 5563
8 8 V1 Avec 5610
9 9 V1 Avec 5641
10 10 V1 Avec 5637
11 11 V1 Sans 5581
12 12 V1 Sans 5808
13 13 V1 Sans 5582
14 14 V1 Sans 5528
15 15 V1 Sans 5754
16 16 V1 Sans 5676
17 17 V1 Sans 5558
18 18 V1 Sans 5724
19 19 V1 Sans 5619
20 20 V1 Sans 5565
21 21 V2 Avec 5458
22 22 V2 Avec 5591
23 23 V2 Avec 5501
24 24 V2 Avec 5714
25 25 V2 Avec 5708
26 26 V2 Avec 5731
27 27 V2 Avec 5691
28 28 V2 Avec 5571
29 29 V2 Avec 5613
30 30 V2 Avec 5359
31 31 V2 Sans 5744
32 32 V2 Sans 5771
33 33 V2 Sans 5592
34 34 V2 Sans 5499
35 35 V2 Sans 5437
36 36 V2 Sans 5413
37 37 V2 Sans 5581
38 38 V2 Sans 5268
39 39 V2 Sans 5526
40 40 V2 Sans 5914
41 41 V3 Avec 5330
42 42 V3 Avec 5485
43 43 V3 Avec 5712
44 44 V3 Avec 5499
45 45 V3 Avec 5461
46 46 V3 Avec 5444
47 47 V3 Avec 5412
48 48 V3 Avec 5396
49 49 V3 Avec 5466
50 50 V3 Avec 5400
51 51 V3 Sans 5431
52 52 V3 Sans 5599
53 53 V3 Sans 5455
54 54 V3 Sans 5435
55 55 V3 Sans 5371
56 56 V3 Sans 5499
57 57 V3 Sans 5623
58 58 V3 Sans 5367
59 59 V3 Sans 5432
60 60 V3 Sans 5475
61 61 V4 Avec 5844
62 62 V4 Avec 5713
63 63 V4 Avec 5841
64 64 V4 Avec 5716
65 65 V4 Avec 5803
66 66 V4 Avec 5725
67 67 V4 Avec 5881
68 68 V4 Avec 5672
69 69 V4 Avec 5869
70 70 V4 Avec 5602
71 71 V4 Sans 5766
72 72 V4 Sans 5408
73 73 V4 Sans 5947
74 74 V4 Sans 5644
75 75 V4 Sans 5848
76 76 V4 Sans 5809
77 77 V4 Sans 5627
78 78 V4 Sans 5881
79 79 V4 Sans 5649
80 80 V4 Sans 5799
#+end_example
*2. Perform a 1-factor ANOVA*
Influence of variety
#+begin_src R :results output :session *R* :exports both
# Yield distribution per wheat variety, one box per variety.
ggplot(ble, aes(x = variete, y = rdt)) +
  geom_boxplot() +
  labs(title = "Whisker boxes", x = "Wheat variety", y = "Yield")
#+end_src
#+RESULTS:
Influence of presence or absence of pesticide
#+begin_src R :results output :session *R* :exports both
# Yield distribution with and without phytosanitary treatment.
ggplot(ble, aes(x = phyto, y = rdt)) +
  geom_boxplot() +
  labs(title = "Boxplot", x = "Phytosanitary treatment", y = "Yield")
#+end_src
#+RESULTS:
*ANOVA test on wheat variety*
Run an ANOVA to test the influence of the wheat variety
#+begin_src R :results output :session *R* :exports both
# One-factor model: yield explained by the variety only.
anova_variete <- lm(formula = rdt ~ variete, data = ble)
summary(anova_variete)
#+end_src
#+RESULTS:
#+begin_example
Call:
lm(formula = rdt ~ variete, data = ble)
Residuals:
Min 1Q Median 3Q Max
-344.20 -69.30 -6.60 89.15 329.90
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5633.80 26.30 214.211 < 2e-16 ***
varieteV2 -49.70 37.19 -1.336 0.18546
varieteV3 -169.20 37.19 -4.549 2e-05 ***
varieteV4 118.40 37.19 3.183 0.00211 **
---
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 117.6 on 76 degrees of freedom
Multiple R-squared: 0.4476, Adjusted R-squared: 0.4258
F-statistic: 20.53 on 3 and 76 DF, p-value: 7.674e-10
#+end_example
Analysis of variance table
#+begin_src R :results output :session *R* :exports both
# Analysis-of-variance table of the variety model.
print(anova(anova_variete))
#+end_src
#+RESULTS:
: Analysis of Variance Table
:
: Response: rdt
: Df Sum Sq Mean Sq F value Pr(>F)
: variete 3 851845 283948 20.525 7.674e-10 ***
: Residuals 76 1051387 13834
: ---
: codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
*Anova test on pesticide*
Analysis of variance on the pesticide used :
#+begin_src R :results output :session *R* :exports both
# One-factor model: yield explained by the phytosanitary treatment only,
# followed by its summary and its analysis-of-variance table.
anova_phyto <- lm(formula = rdt ~ phyto, data = ble)
summary(anova_phyto)
anova(anova_phyto)
#+end_src
#+RESULTS:
#+begin_example
Call:
lm(formula = rdt ~ phyto, data = ble)
Residuals:
Min 1Q Median 3Q Max
-337.12 -127.95 -4.17 106.03 341.88
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5612.23 24.69 227.291 <2e-16 ***
phytoSans -7.10 34.92 -0.203 0.839
---
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 156.2 on 78 degrees of freedom
Multiple R-squared: 0.0005297, Adjusted R-squared: -0.01228
F-statistic: 0.04134 on 1 and 78 DF, p-value: 0.8394
Analysis of Variance Table
Response: rdt
Df Sum Sq Mean Sq F value Pr(>F)
phyto 1 1008 1008.2 0.0413 0.8394
Residuals 78 1902223 24387.5
#+end_example
The p-value is 0.8394, which is above 5%, so we do not reject H0 (alpha1 =
alpha2 = 0): there is no significant effect of pesticide on wheat yield
here.
*3. Perform a 2-factor ANOVA*
Looking for influence of pesticide on a specific variety.
#+begin_src R :results output :session *R* :exports both
# Two-factor model with interaction: variety, treatment and variety:treatment,
# followed by its summary and its analysis-of-variance table.
anova_variete_phyto <- lm(formula = rdt ~ variete * phyto, data = ble)
summary(anova_variete_phyto)
anova(anova_variete_phyto)
#+end_src
#+RESULTS:
#+begin_example
Call:
lm(formula = rdt ~ variete * phyto, data = ble)
Residuals:
Min 1Q Median 3Q Max
-329.80 -67.45 -8.20 76.28 339.50
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5628.10 38.09 147.772 < 2e-16 ***
varieteV2 -34.40 53.86 -0.639 0.52507
varieteV3 -167.60 53.86 -3.112 0.00267 **
varieteV4 138.50 53.86 2.571 0.01219 *
phytoSans 11.40 53.86 0.212 0.83298
varieteV2:phytoSans -30.60 76.17 -0.402 0.68908
varieteV3:phytoSans -3.20 76.17 -0.042 0.96661
varieteV4:phytoSans -40.20 76.17 -0.528 0.59930
---
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 120.4 on 72 degrees of freedom
Multiple R-squared: 0.4512, Adjusted R-squared: 0.3979
F-statistic: 8.458 on 7 and 72 DF, p-value: 1.622e-07
Analysis of Variance Table
Response: rdt
Df Sum Sq Mean Sq F value Pr(>F)
variete 3 851845 283948 19.5749 2.205e-09 ***
phyto 1 1008 1008 0.0695 0.7928
variete:phyto 3 5968 1989 0.1371 0.9375
Residuals 72 1044411 14506
---
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#+end_example
We can see that the p-value of 93.75% is well above 5%, so this interaction
has no significant impact on yield.
#+RESULTS:
:
: [1] "Species" "Diet" "Clade" "BOW" "BRW" "AUD" "MOB"
: [8] "HIP"
*** reminder
anova -> qualitative
linear regression -> quantitative
** Coming back to certain exercises
There is a link on the website (look at it)
- Common pitfalls/problems
- Unbalanced design -> for example, where there is a lot of variance
you need a lot of experiments.
- Missing combinations (interactions)
- Colinear factors
- Biased data (uniform interest ? over-representation)
- Commonly wrong hypothesis
- Linearity
- Normal residuals for an ANOVA
- Noisy factors
- Building the model and adding variables at will
- Spurious correlations
- Building spurious correlations
- Causal inference
- Motivation for DoE:
- Avoid Bias
- Balanced design
* Tools
** Binder
Launch docker image
Launch docker imag
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment