#Graduate Project - GLM, Count Data and the Poisson Regression

#For my graduate (doctoral) project, I will be measuring the signaling (clicking) count
#data of the treehopper species Umbonia crassicronis under various conditions and 
#contexts and levels of perturbation.

#GLM/MULTIPLE REGRESSION (also see Biostats 380-410)
#The response variable (dependent) will be number of clicks (Y). The first 
#independent variable will be time (X1), in seconds. The second independent variable
#will be the factor Plant ID (X2), with levels (plants) A, B, C, D, F, H, J and K.
#The third independent variable will be factor sex (X3), with levels M and F. The fourth
#independent variable will be factor level of perturbation (X4), with factors light, 
#medium and heavy. The final independent variable will be date of recording (X5).

data1=read.table("Preliminary Click Data.txt", header=T)
data1
attach(data1)

Y=click_number
X1=time
X2=factor(plant_ID)
X3=factor(sex)
X4=factor(stimulus_strength)
X5=factor(date)

FM=lm(Y~X1+X2+X3+X4+X5)
FM

summary(FM)

anova(FM)

#After creating the linear model, the summary() function give the p-values for each 
#independent variable as well as an overall p-value, F stat and regression coefficient
#Shown by the low p-value and high regression coefficent, we can see that there is an
#overall relationship between Y and at least one of the X's. Therefore, we reject the
#null and accept the alternative hypothesis that at least one beta is not equal to 0

#Now lets compare our full model to reduced models to see which X's show the association
#with Y.

step(FM, direction="backward")

#shown by the step() function and AIC values, we can see that the optimal model to use
#is a Reduced Model with only X1, X3 and X5 as independent variables. In other words,
#the variables of plant ID and stimulus strength show no relationship with click count,
#and the betas of these X's are not significantly different from 0 (hopefully this wont
#be the case when i have usable data!) Since our reduced model is optimal, lets assign
#it to the variable RM.

RM=lm(Y~X1+X3+X5)
RM

#Now lets compare the FM and the RM with the anova() function

anova(FM,RM)

#Shown by the high p-value for Model 2, we cannot reject the null. This means that
#the more parsimonious (and thus optimal) model is the reduced model which excludes
#X2 (plant ID) and X4 (stimulus strength).

#Now to individually compare our 3 favored variables to Y, lets make individual linear
#models for them
LM1=lm(Y~X1)
LM3=lm(Y~X3)
LM5=lm(Y~X5)



#Here's how to make dataframes of the values of Yhat and e for the 3 favored variables (RM). 

Yhat1=fitted(LM1)         #---------> Yhat = line of best fit 
Yhat3=fitted(LM3)        
Yhat5=fitted(LM5)

e1=residuals(LM1)         #-------> error
e3=residuals(LM3)
e5=residuals(LM5)

RESULTS1=data.frame(X1,Y,Yhat1,e1)
RESULTS1

RESULTS3=data.frame(X3,Y,Yhat3,e3)
RESULTS3

RESULTS5=data.frame(X5,Y,Yhat5,e5)
RESULTS5


#Heres how to plot the regression plots for the 3 favored variables (RM) compared to Y
plot(X1,Y, main="Total Number of Clicks Over Time", xlab="Time (in seconds)", ylab="Number of Clicks")
abline(LM1,col="orange")
segments(X1,Yhat1,X1,Y,col="blue")

#Shown by Graph #1, the number of clicks was greater with longer recordings, as expected




plot(X3,Y, main="Number of Clicks for Males and Females", xlab="Sex", ylab="Number of Clicks")

#Graph #2 shows that females clicked more than males




plot(X5,Y, main="Number of Clicks on Different Days", xlab="Date", ylab="Number of Clicks")

#Shown by Graph #3, the number of clicks varied on different days

detach(data1)

#-----------------------------------------------------------------------------------

#GLM 040 - Poisson Regression for Count Data

#Poisson Regression and related Negative Binomial Regression, are generalized linear 
#models typically involving count data as the response variable. As such, non-negative
#values are not expected (nor allowed), and responses are considered to be integers. 
#The distribution of response values, as in a histogram for each set of independent 
#variables indicated by index i, may look quite different for small versus large expected 
#values (means). For the Poisson distribution, the distribution around small expected
#values are highly asymmetric, whereas for larger values symmetric. In addition, variance
#increases with expected value. 

#Modeling with a Poisson Regression will be more accurate for this type of count data.
#Lets reassign both the full and reduced model here:

library(nlme)
library(MASS)

FM1=glm(Y~X1*X2*X3*X4*X5, family=poisson)
summary(FM1)

RM1=glm(Y~X1*X3*X5, family=poisson)
summary(RM1)

anova(FM1,RM1, test="chisq")

#As these procedures are beyond the scope of this class, next semester's seminar should
#clarify these approaches so I can use them with my real data

#to be continued....