Saturday, December 16, 2017

My lecture on R-Programming

http://geog.uoregon.edu/bartlein/courses/geog495/lec02.html
Data visualisation Class with R. (Lecture - 1)
This course is only for psychology researchers.

MTCARS


# Load the built-in "mtcars" data set into the workspace.
data("mtcars")



### TASK 1:

# Attach package "datasets":
library("datasets")

# Load data "mtcars":
data("mtcars")

# Open the help page that describes the data set:
help("mtcars")

# Print the whole data set:
print(mtcars)

# Print only the first six rows (column names included):
print(head(mtcars, n = 6L))

# Names of the variables (columns):
names(mtcars)

# Labels of the observations (rows):
rownames(mtcars)

# Size of the data set: both dimensions, then rows and columns separately
dim(mtcars)
nrow(mtcars)
ncol(mtcars)

# Basic statistics for every variable:
summary(mtcars)

# Other statistics, computed for variable "mpg":
mpg <- mtcars[["mpg"]]
mean(mpg)
median(mpg)
var(mpg)
sd(mpg)
range(mpg)
IQR(mpg)
quantile(mpg, probs = 0.67)

# Correlation and covariance matrices of all variables:
cor(mtcars)
cov(mtcars)

# Matrix of pairwise scatterplots:
pairs(mtcars)

# Histogram for variable "mpg".
# The argument is written as mtcars$mpg because hist() builds its default
# title and x-axis label from the expression it is given.
hist(mtcars$mpg)
# Histogram for variable "mpg" with some graphical options:
hist(
  mpg,
  col = "blue",
  main = "Histogram for variable mpg",
  xlab = "Values of mpg",
  ylab = "Frequency"
)


# Boxplot for variable "mpg":
boxplot(mpg)
# Boxplot for variable "mpg" with some graphical options:
boxplot(
  mpg,
  col = "grey",
  main = "Boxplot for variable mpg",
  xlab = "Variable mpg",
  ylab = "Boxplot"
)


# Scatterplot (function scatterplot() from package "car").
# Install "car" only when it is missing, so that re-running the script does
# not download and reinstall the package every time.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# The arguments are written as mtcars$... because scatterplot() derives its
# default axis labels from the expressions it is given.
scatterplot(mtcars$mpg, mtcars$disp)

# TASK 2
# NOTE: this path is specific to the original author's machine; change it to
# the folder that holds "daneSoc.csv" before running.
setwd("C:\\Documents and Settings\\Pawel\\Moje dokumenty\\Dydaktyka\\ADVANCED_STATISTICAL_METHODS\\Data")

# Read the semicolon-separated data file. Text columns are read as factors
# (the default behaviour before R 4.0), so that summary() reports counts per
# category and the grouping variables used below are categorical.
Data <- read.csv("daneSoc.csv", sep = ";", stringsAsFactors = TRUE)

# Basic statistics for all variables:
summary(Data)

# Basic statistics for selected variables, e.g. cisnienie.skurczowe and plec:
summary(Data$plec)
summary(Data$cisnienie.skurczowe)

# Contingency table for wyksztalcenie and praca:
table(Data$wyksztalcenie, Data$praca)

# Histogram for wiek:
hist(Data$wiek)

# Empirical cumulative distribution function of wiek:
plot(ecdf(Data$wiek))

# Boxplots for wiek in the groups determined by variable wyksztalcenie:
boxplot(Data$wiek ~ Data$wyksztalcenie)

# Scatterplot for cisnienie.skurczowe and cisnienie.rozkurczowe, drawn
# separately for the groups given by plec. sp() comes from package "car",
# which must already be attached.
sp(Data$cisnienie.skurczowe, Data$cisnienie.rozkurczowe, groups = Data$plec)

# Mosaic plot of the three-way table praca x wyksztalcenie x plec:
library(graphics)
mosaicplot(table(Data$praca, Data$wyksztalcenie, Data$plec))

# Balloon plot (package "gplots"); install the package only when missing so
# that re-running the script does not reinstall it every time.
if (!requireNamespace("gplots", quietly = TRUE)) {
  install.packages("gplots")
}
library(gplots)
balloonplot(table(Data$plec, Data$wyksztalcenie))

# Pie chart of wyksztalcenie.
# pie() needs non-negative numbers, so the categories are counted with
# table() first; passing the categorical column itself raises an error.
pie(table(Data$wyksztalcenie))

# TASK 3

# Logical masks and the systolic pressure column, reused by the selections
# below.
is_man <- Data$plec == "mezczyzna"
has_secondary <- Data$wyksztalcenie == "srednie"
systolic <- Data$cisnienie.skurczowe

# cisnienie.skurczowe in the group of men with secondary education.
summary(systolic[is_man & has_secondary])

# cisnienie.skurczowe in the groups of out-of-work men (x1) and of men who
# work or study (x2).
x1 <- systolic[is_man & Data$praca == "nie pracuje"]
x2 <- systolic[is_man & Data$praca == "uczen lub pracuje"]
boxplot(x1, x2)

# Select patients who have secondary education and whose systolic blood
# pressure (cisnienie.skurczowe) is between 140 and 150, both ends included.
# The upper bound must be tested with <=; testing ">= 150" here would keep
# only pressures of 150 and above.
Data[systolic >= 140 & systolic <= 150 & has_secondary, ]

# Find the patient (the observation number) with the highest systolic blood
# pressure.
which.max(systolic)

# Find the patient(s) (the observation number(s)) whose diastolic blood
# pressure is 107.
which(Data$cisnienie.rozkurczowe == 107)

# Find patients (the observation numbers) whose systolic blood pressure is
# greater than the 0.8 empirical quantile of systolic blood pressure.
q08 <- quantile(systolic, 0.8)
which(systolic > q08)

# TASK 4

# Build a matrix with 20 rows and 2 columns: column X1 contains only 1's,
# column X2 contains the even numbers starting from 0 (0, 2, ..., 38).
X1 <- rep(1, times = 20)
X2 <- seq(from = 0, by = 2, length.out = 20)
X <- cbind(X1, X2)


# Linear combination of the two columns: v = 0.5 * X1 + 2 * X2.
b <- c(0.5, 2)
v <- X %*% b

# Positions of the entries of v that are smaller than the mean of v.
which(v < mean(v))

# TASK 5

# Colours and line types shared by the three figures (one entry per curve).
curve_col <- c("red", "blue", "orange")
solid <- rep(1, 3)

# Normal distribution: probability density functions for different values of
# the parameters. The first curve of each figure is drawn with plot(), which
# sets up the axes; the others are added with lines().
x <- seq(from = -3, to = 3, by = 0.01)

# Different means, sd fixed at 1.
plot(x, dnorm(x, mean = 0, sd = 1), col = curve_col[1], type = "l")
lines(x, dnorm(x, mean = 1, sd = 1), col = curve_col[2])
lines(x, dnorm(x, mean = 2, sd = 1), col = curve_col[3])
legend("topleft", c("mean=0", "mean=1", "mean=2"), col = curve_col, lty = solid)

# Different standard deviations, mean fixed at 0.
plot(x, dnorm(x, mean = 0, sd = 1), col = curve_col[1], type = "l")
lines(x, dnorm(x, mean = 0, sd = 2), col = curve_col[2])
lines(x, dnorm(x, mean = 0, sd = 3), col = curve_col[3])
legend("topleft", c("sd=1", "sd=2", "sd=3"), col = curve_col, lty = solid)


# Chi-squared distribution for different degrees of freedom.
x <- seq(from = 0, to = 8, by = 0.01)

# ylim caps the vertical axis at 0.5.
plot(x, dchisq(x, df = 1), col = curve_col[1], type = "l", ylim = c(0, 0.5))
lines(x, dchisq(x, df = 2), col = curve_col[2])
lines(x, dchisq(x, df = 3), col = curve_col[3])
legend("topleft", c("df=1", "df=2", "df=3"), col = curve_col, lty = solid)

# TASK 6

# Sample drawn from the standard normal distribution, shown in a normal
# Q-Q plot. Try different sample sizes.
x <- rnorm(n = 100, mean = 0, sd = 1)
qqnorm(x)

# Samples drawn from non-normal distributions (gamma, then Cauchy): their
# Q-Q plots show that these samples are not normal.
x <- rgamma(n = 100, shape = 1, scale = 1)
qqnorm(x)

x <- rcauchy(n = 100, location = 0, scale = 1)
qqnorm(x)




1. Why R?
In my data visualisation classes, I often produced graphics with SPSS. SPSS is menu driven, so as a researcher I have little control over the program. Second, the graphic output of SPSS is not appealing, for example in its line thickness.
> x <- c(1,2,3,4,5)
> x
[1] 1 2 3 4 5
> hist(x)
Here I created one vector or variable named x (small letter) and see the histogram. Sometimes I am confused as which is right ? SPSS or R for same data. For example, for the same vector SPSS and R provides different graphs.
R can be extended with more than 7000 packages (libraries). It is regularly updated by data scientists all around the world, and all of its facilities are open access. There are also many more kinds of graphics in R. Furthermore, R is interactive, as if you were talking with the machine. Here is one example.
> x=5+4
> x
[1] 9
>
I read Garette, Guilford. I learnt many formula but when I use SPSS, I feel awkward as it has no utility here as I am acting as machine. I have no freedom to control. But in R, I can control my analysis.

==============================================================================
Data visualisation Class with R. (Lecture - 2)
2. How can I install R?
There are two versions. I prefer R studio. Following are the steps:
Go to https://www.rstudio.com/products/rstudio/download/ In ‘Installers for Supported Platforms’ section, choose and click the R Studio installer based on your operating system.
The download should begin as soon as you click.Click Next..Next..Finish.Download Complete.
To Start R Studio, click on its desktop icon or use ‘search windows’ to access the program. It looks like this:

Choose Your Version of RStudio RStudio is a set of integrated tools designed to help you be more productive with R. It includes a console, syntax-highlighting editor that supports direct code execution, and a variety of robust tools for plotting, viewing history, debugging and

RSTUDIO.COM


======================================================



Data visualisation Class with R. (Lecture - 3)
3. Why R studio?
In data visualisation, it is important to manipulate attributes of the graph. This interactive change is visible through R studio.
RStudio is a free and open-source integrated development environment (IDE) for R, a programming language for statistical computing and graphics. RStudio was founded by JJ Allaire, creator of the programming language ColdFusion. Hadley Wickham is the Chief Scientist at RStudio.
Researchers change attributes in R console and it's results are displayed in graphical output window. Old script is visible in script window and specific r environment is visible in R environment window. R environment window helps in understanding stored variables. For example :
>Iq<- 100
>Iq1<- 120
>ls ( )
>"Iq" "Iq1"
One can remove the variable by
>rm(Iq)
==========================================================================
Data visualisation Class with R. (Lecture - 4)
4. How can I transfer file?
I have noticed that r is more friendly to csv file. CSV stands for "comma-separated values". CSV is a simple file format used to store tabular data, such as a spreadsheet or database. Files in the CSV format can be imported to and exported from programs that store data in tables, such as Microsoft Excel.
So, my suggestion is to convert your spreadsheet with header into csv file by using save as <file name>.csv
Input file
a,b,c
10,20,30
10,20,30
In console, type
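my.data = read.csv(file.choose(), header=TRUE)
Here the file.choose() command opens a dialog that asks you to select the file. Use read.csv (or read.table with sep=",") for a comma-separated file; plain read.table splits on white space and would read each row as a single column named a.b.c.
> my.data
   a  b  c
1 10 20 30
2 10 20 30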
If you know the source, then write
my.data<-read.csv("C:/Users/DDROY/Desktop/test.csv")
You can remove data file by
>rm(my.data)
N.B.: But the above command is not applicable for R-studio as it imports the file from source. R-studio directly imports file from Excel, SPSS etc.
==========================================================================
Data visualisation Class with R. (Lecture - 5)
2. Is interactive mode possible?
Interactive refers to two-way flow of information between a computer and a computer-user; responding to a user’s input.
R programming is interactive. Here researchers can communicate with the computer as if with a friend. The only thing you have to learn is its own language.
The sign '>' is the prompt that R displays when it is ready to accept a command; whatever you type after it is read as a command.
For example
>x=10/5
>x
[1] 2
Or
>x=c("Kolkata","Delhi")
>x
[1] "Kolkata" "Delhi"
So the same variable name can hold numeric data at one time and character (alphanumeric) data at another.

========================================================================
Data Visualization class with R (Lesson 7)
7. Scatter Plot
scatter plot (also called a scatter graph, scatter chart, scattergram, or scatter diagram) is a type of plot or mathematical diagram using Cartesian coordinates to display values for typically two variables for a set of data.
Scatter Plot is useful to understand relation of two variables. Usually dataset of explanatory variables are on X-axis and it's changes on dependent variable are on Y-axis.
R is very good for statistics. In the input file, write variable name as header. And in read command, write header = T or TRUE.
> hp<-read.csv("aust.csv",header=T,sep=",")
> hp
Year NSW Vic. Qld SA WA Tas
1 1917 1904 1409 683 44 0 3 6
2 1927 2402 1727 873 56 5 3 92
3 1937 2693 1853 993 58 9 4 57
4 1947 2985 2055 110 6 6 46 502
5 1957 3625 2656 141 3 8 73 688
6 1967 4295 3274 170 0 1 110 87
7 1977 5002 3837 213 0 1 286 12
8 1987 5617 4210 267 5 1 393 14
9 1997 6274 4605 340 1 1 480 17
> plot(NSW~Year, data=hp, pch=16)
>
The option pch=16 sets the plotting character to a solid black dot.

=========================================================================
Data Visualization class with R (Lesson 8, date:14.1017)
8. Histogram
A histogram is a plot that lets you discover, and show, the underlying frequency distribution (shape) of a set of continuous univariate data. This allows the inspection of the data for its underlying distribution (e.g., normal distribution), outliers, skewness, etc.
Initially number of scores in each class interval or bin is counted. Later on plot is prepared. Plot shows binwise frequency of scores.
Histogram plot can be drawn in R with following arguments.
hist(x$v1, main="Anxiety", xlab=" Level", ylab="Number of cases", border ="blue")
Here
x$v1 = data of v1.
main=Graph name
xlab=name of x axis
Border =blue colour of histogram
>hist(x$V1)
>hist(x$V1, main="my histogram")
>hist(x$V1, main="my histogram", xlab="Anxiety")
>hist(x$V1, main="my histogram", xlab="Anxiety", ylab="frequency ")
hist(x$V1, main="my histogram", xlab="Anxiety", ylab="frequency ", border="red")

=========================================================================


Data Visualization class with R (Lesson 9, date:14.1017)
9. Random number
We have read about the normal distribution with mean = 0, but we have not seen it. Today I will show you data drawn from a distribution with mean = 0 and SD = 1. Keep in mind that mean = 0 is the ideal (population) value; a finite random sample will not reproduce it exactly, so the sample mean will only be close to 0.
This is the example:
> x=rnorm(10,0,1) # 10 is number of data, 0 is Mean, 1 is SD.
> x
[1] -0.8161694 1.4684068 0.2120832 -2.2949087 -0.4389617 1.3511973
[7] -0.9904338 -1.8606424 0.3877817 0.5777760
> mean(x)
[1] -0.2403871
> sd(x)
[1] 1.269368
> x=rnorm(10,0,1)
> x
[1] -1.4317261 0.6816895 -2.8454675 -0.6584917 1.1255525 -1.7652789
[7] -0.7825077 -0.2994345 0.2827423 -0.8911193
> mean(x)
[1] -0.6584041
> sd(x)
[1] 1.18645
>


=======================================================================
Data Visualization class with R (Lesson 10, date:16.10.17)
10. Reading file from any directory.
R wants to read data from directory. Therefore, we initially change the directory or use separate arguments. Here is another argument in which researcher will show the source directory.
>my.data = read.table(file.choose(), header=TRUE)
check the data by this command
>str(my.data)
==========================================================================
Data Visualization class with R (Lesson 11, date:16.10.17)
11. Bar plot & Histogram
Histograms are used to show distributions of variables while bar charts are used to compare variables. Histograms plot quantitative data with ranges of the data grouped into bins or intervals while bar charts plot categorical data
x<-read.csv(file.choose(),header=TRUE)
barplot(x$Anxiety)
> x$Anxiety
[1] 42 32 19 42 30 5 11 8 0 5 3 2

=============================================================================




Data Visualization class with R (Lesson 12 date:16.10.17)
12. Color Bar plot
>barplot(x$anxiety,col="blue", horiz=T, main="health data ", xlab="year")
============================================================================
Data Visualization class with R (Lesson 13 date:17.10.17)
13. line chart
Monthwise anxiety scores
>v=c(7,12,28,3,41)
>plot(v,type="o",col="red",xlab="Month", main="Anxiety scores over months")
Two line charts
t=c(14,7,6,19,3)
> lines(t,type="o",col="blue")

=========================================================================
Data Visualization class with R (Lesson 14 date:21.10.17)
21. Stem-leaf plot (Confusion...do not copy..rather test and show)
A Stem and Leaf Plot is a special table where each data value is split into a "stem" (the first digit or digits) and a "leaf" (usually the last digit). Like in this example:
1 2345
2 2345678
3 12345678912345678
4 234
5 34
53 is divided into two 5 is stem and 3 is leaf
like wise 54
When it is decimal, value after decimal is leaf. For example 2.5,2.6,2.7,1.5,1.9
1 5 9
2 5 6 7
Here is a data set.
> x=c(10,12,22,25,34,22,23)
> stem(x)
The decimal point is 1 digit(s) to the right of the |
1 | 02
1 |
2 | 223
2 | 5
3 | 4
But this result is not meaningful
It should be
1 02
2 2235
3. 4
===========================================================================

Data Visualization class with R (Lesson 15 date:22.10.17)
21. Boxplot
Boxplot is a simple way of representing statistical data on a plot in which a rectangle is drawn to represent the second and third quartiles, usually with a vertical line inside to indicate the median value. The lower and upper quartiles are shown as horizontal lines either side of the rectangle.
>boxplot(x, main="Depression level, High=less depressed", ylab="Scores", xlab="Fig.1. Few people are still depressed")
> boxplot(x, main="Depression level, High=less depressed", ylab="Scores", xlab="Fig.1. Few people are still depressed", col="darkgreen")
============================================================================
Data Visualization class with R (Lesson 16 date:22.10.17)
In class lecture 15, I presented the boxplot of a single variable. Here is the distribution of several variables.
The data are stored in x, so the command is:
> boxplot(x)
Since the variables are not all on the same scale, the distributions are not clear. Therefore, always make sure that all the variables are on the same scale before drawing them in one boxplot.
Here is the data.frame:
> str(x) # command for data structure
'data.frame': 9 obs. of 7 variables:
$ Year: int 1917 1927 1937 1947 1957 1967 1977 1987 1997
$ NSW : int 1904 2402 2693 2985 3625 4295 5002 5617 6274
$ Vic.: int 1409 1727 1853 2055 2656 3274 3837 4210 4605
$ Qld : int 683 873 993 110 141 170 213 267 340
$ SA : Factor w/ 8 levels "0 1","1 1","3 8",..: 4 6 7 8 3 1 1 5 2
$ WA : Factor w/ 9 levels "0 3","110","286",..: 1 7 9 5 8 2 3 4 6
$ Tas : int 6 92 57 502 688 87 12 14 17
>

No automatic alt text available.


===========================================================================
Data Visualization class with R (Lesson 17 date:22.10.17)
In earlier class lectures, you have understood the importance of the box-and-whisker plot developed by John Tukey. I presented boxplots of single and multiple variables. In the case of multiple variables, I told you to put all variables on a similar scale. In this lecture, I will show you outliers. An outlier affects the central tendency. An outlier may result from a typing mistake, or it may be real data. Outliers also distort correlation coefficients. Therefore it is important to examine whether outliers exist.
No automatic alt text available.

=========================================================================

Data Visualization class with R (Lesson 18 date:30.10.17)
18. Observation location
In case of Uni plot, three things are important.
A. location
B. dispersion
C. distribution
Here is one location of observation
A. Enumerative plots, in which all observations are shown, have the advantage of not losing any specific information–the values of the individual observations can be retrieved from the plot. The disadvantage of such plots arises when there are a large number of observations–it may be difficult to get an overall view of the properties of a variable. Enumerative plots do a fairly good job of displaying the location, dispersion and distribution of a variable, but may not allow a clear comparison of variables, one to another.
Command
> plot(x$RPM)
B. > hist(x$RPM)
C. > stem(x$RPM)
The decimal point is 1 digit(s) to the right of the |
3 | 3
3 | 57
4 | 11122223
4 | 555556667777888899999
5 | 000000122222333333334444
5 | 555555555555556666666666777777888888999
6 | 00
No automatic alt text available.
No automatic alt text available.

===========================================================================
Data visualisation Class with R. (Lecture - 19) 31.1017
19. Data Table
A data frame is used for creating table. Here is a data from different sources and will be merged in single data frame.
Sources
>a=c(2,3,4,5) # storing numeric.
>b=c("Delhi","Kolkata","Chennai","Mumbai") #storing non-numeric.
In the above, numeric data are stored in a and non-numeric data are stored in b.
The command data.frame is used to form a new data table. This is stored in d.
>d=data.frame(a,b)
Both a and b are stored in d as columns of a data frame.
>d
New data table will be displayed.
>head(d) # a and b are headers.
>d=data.frame(b,a)
> d
        b a
1   Delhi 2
2 Kolkata 3
3 Chennai 4
4  Mumbai 5
> d[,1]
[1] Delhi   Kolkata Chennai Mumbai 
Levels: Chennai Delhi Kolkata Mumbai
names(d)

EXTENDING DATA FRAME
[1] "a" "b"
> f=c("chocolate","cocacola","orange","apple") # new variable f is added
> d=data.frame(b,a,f)
> d
        b a         f
1   Delhi 2 chocolate
2 Kolkata 3  cocacola
3 Chennai 4    orange
4  Mumbai 5     apple


>pie(a,b)
>pie(a,f)
No automatic alt text available.
No automatic alt text available.





https://www.tutorialspoint.com/r/r_pie_charts.htm
===========================================================================
Data visualisation Class with R. (Lecture - 20) 1.11.17
20. Class interval
>score=c(10,15,10,20,20,25,30,20,30,32,40,45,48,50)
> stem(score)
The decimal point is 1 digit(s) to the right of the |
1 | 005
2 | 0005
3 | 002
4 | 058
5 | 0
> summary(score)
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.00 20.00 27.50 28.21 38.00 50.00
> table(score)
score
10 15 20 25 30 32 40 45 48 50
2 1 3 1 2 1 1 1 1 1
> bins=seq(2,50,by=2)
> bins
[1] 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50
>
> v1.cut=cut(v1,bins,right=F)
> v1freq=table(v1.cut)
> v1freq
v1.cut
[2,4) [4,6) [6,8) [8,10) [10,12) [12,14) [14,16) [16,18) [18,20) [20,22)
7 20 40 24 6 0 0 0 0 0
[22,24) [24,26) [26,28) [28,30) [30,32) [32,34) [34,36) [36,38) [38,40) [40,42)
0 0 0 0 0 0 0 0 0 0
[42,44) [44,46) [46,48) [48,50)
0 0 0 0
>
=======================================================================
22. Data visualization with R (lecture note no. 22)
22. Naming and Name calling
When we are born. Our parents give name. And we are getting one title indicating our heredity or root. Like wise we are giving names to variables.
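Example : Four people whose names begin with Deb.
>Deb=list(debi="debi", debjyoti="debjyoti", debbani="debbani", debasree="debasree")
R is case sensitive: one cannot call Debi, because the stored name debi starts with a small d.
Another thing is that you have to call debi with the title, i.e. with the name of the object that holds it. The $ operator works for lists and data frames with named elements (for a plain vector made with c(), use an index such as Deb[1] instead). So
>Deb$debi
>Deb$debjyoti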
=======================================================================
23. Data visualization with R (lecture note no. 23)
23. Array use (storing,retrieving,transforming, frequency table, proportion and histogram)
Arrays are the R data objects which can store data in more than two dimensions. 
> x=c(1,2,3)
> y=x+2
> y
[1] 3 4 5
Read the data from file and store them
>x=read.csv(file.choose(),header=T)
> str(x)
'data.frame': 82 obs. of 34 variables:
$ SL.NO : int 1 2 3 4 5 6 7 8 9 10 ...
$ NAME : Factor w/ 82 levels " DILIP KUMAR ROY",..: 26 37 15 59 34 50 79 67 68 24 ...
$ area : Factor w/ 2 levels "BANSBARI","BHUYANPARA": 2 2 2 2 2 2 2 2 2 2 ...
$ area_code: int 2 2 2 2 2 2 2 2 2 2 ...
$ J1 : int 3 3 1 3 4 5 3 5 4 4 ...
$ J2 : int 4 4 3 4 4 4 4 4 4 4 ...
$ J3 : int 4 4 0 4 4 4 3 5 4 4 ...
$ J4 : int 2 3 0 3 4 4 3 NA 3 4 ...
$ J5 : int 3 3 0 4 3 3 0 4 4 3 ...
$ J6 : int 4 4 2 4 4 5 4 4 4 3 ...
$ J7 : int 4 4 0 4 3 4 4 4 4 4 ...
$ J8 : int 4 4 0 4 0 0 4 0 0 0 ...
$ J9 : int 0 0 0 1 0 0 0 0 0 0 ...
$ J10 : int 5 5 NA 5 0 0 5 5 4 5 ...
$ J11 : int 2 NA 4 3 3 3 3 4 3 3 ...
$ J12 : int 3 1 NA 3 2 3 4 5 4 4 ...
$ J13 : int 3 2 2 3 3 3 0 4 4 3 ...
$ J14 : int 1 0 0 3 1 4 1 4 3 4 ...
$ J15 : int 3 2 0 3 3 3 2 3 3 3 ...
$ J16 : int 0 0 0 0 0 0 0 0 0 0 ...
$ J17 : int 3 0 0 3 0 0 3 0 0 0 ...
$ J18 : int 4 2 4 3 4 4 4 4 4 4 ...
$ J19 : int 4 3 0 3 0 0 1 4 0 0 ...
$ J20 : int 0 0 0 1 0 0 0 4 NA 0 ...
$ J21 : int 3 1 0 3 3 4 0 NA 2 3 ...
$ J22 : int 1 1 0 4 2 2 0 4 3 4 ...
$ J23 : int 2 3 0 2 4 4 0 5 4 4 ...
$ J24 : int 2 3 1 1 3 3 1 4 3 3 ...
$ J25 : int 4 3 0 3 0 0 0 3 3 3 ...
$ J26 : int 2 2 0 3 3 3 0 3 3 2 ...
$ J27 : int 5 4 0 4 3 0 4 4 3 4 ...
$ J28 : int 4 4 3 4 0 0 4 4 0 3 ...
$ J29 : int 4 3 0 4 4 4 4 5 4 4 ...
$ J30 : int 4 4 4 4 3 2 4 4 4 4 ...
> y=table(x$J1)
> y
0 1 2 3 4 5
1 11 4 37 23 5
> (y/sum(y)*100)
0 1 2 3 4
1.234568 13.580247 4.938272 45.679012 28.395062
5
6.172840
>
> z=(y/sum(y)*100)
> z
0 1 2 3 4
1.234568 13.580247 4.938272 45.679012 28.395062
5
6.172840
> barplot(z)

LikeShow More Reactions
Comment
=======================================================================

7.11.17
Data visualization class with R (lecture no. 24, 25, 26)
24. Missing data
25. Table
26. Chisquare
In r, missing data is indicated by NA. Presence of missing data causes difficulty in calculation.
> mean(x$AGE.CODE)
[1] NA
Therefore, use the argument na.rm=TRUE (or its abbreviation na.rm=T):
> mean(x$AGE.CODE,na.rm=T)
[1] 1.5
25. Table
> table(x$Anxiety.Disorder)
0 1
438 42
> table(x$Anxiety.Disorder,x$Somatoform.Disorder)
0 1
0 394 44
1 42 0
26. Chi-square test
> chisq.test(x$Anxiety.Disorder,x$Somatoform.Disorder)
Pearson's Chi-squared test with Yates' continuity correction
data: x$Anxiety.Disorder and x$Somatoform.Disorder
X-squared = 3.5168, df = 1, p-value = 0.06075

=========================================================================
19.11.17

Data Visualization in R (Lecture no. 27)

>a=c(2,3,4,5,6) # data creation
>b=c(10,20,30,40,50)
>save(a,b, file="e:/rdataset/my_data") # saving data to specific file
>rm (a,b)
>load(file="e:/rdataset/my_data") # Loading data
> a
[1] 2 3 4 5 6 # recovered data
> b
[1] 10 20 30 40 50


===================================================================


CONTROL STATEMENTS
Source: http://uc-r.github.io/control_statements

if statement
The conditional if statement is used to test an expression. If the test_expression is TRUE, the statement gets executed. But if it’s FALSE, nothing happens.

# syntax of if statement
if (test_expression) {
        statement
}
The following is an example that tests if any values in a vector are negative. Notice there are two ways to write this if statement; since the body of the statement is only one line you can write it with or without curly braces. I recommend getting in the habit of using curly braces, that way if you build onto if statements with additional functions in the body or add an else statement later you will not run into issues with unexpected code procedures.

x <- c(8, 3, -2, 5)

# form 1: single-statement body, no curly braces
if (any(x < 0)) print("x contains negative numbers")
## [1] "x contains negative numbers"

# form 2: the same test with the body wrapped in curly braces
if (any(x < 0)) {
  print("x contains negative numbers")
}
## [1] "x contains negative numbers"

# when the test expression is FALSE the body is skipped,
# so nothing is printed
y <- c(8, 3, 2, 5)

if (any(y < 0)) {
  print("y contains negative numbers")
}


if...else statement
The conditional if...else statement is used to test an expression similar to the if statement. However, rather than nothing happening if the test_expression is FALSE, the else part of the function will be evaluated.

# syntax of if...else statement
if (test_expression) {
        statement 1
} else {
        statement 2
}
The following extends the previous example illustrated for the if statement in which the if statement tests if any values in a vector are negative; if TRUE it produces one output and if FALSE it produces the else output.

# x has a negative element, so the test is TRUE and statement 1 runs
x <- c(8, 3, -2, 5)

if (any(x < 0)) {
  print("x contains negative numbers")
} else {
  print("x contains all positive numbers")
}
## [1] "x contains negative numbers"

# y has no negative element, so the test is FALSE and statement 2
# (the else branch) runs
y <- c(8, 3, 2, 5)

if (any(y < 0)) {
  print("y contains negative numbers")
} else {
  print("y contains all positive numbers")
}
## [1] "y contains all positive numbers"
Simple if...else statements, as above, in which only one line of code is being executed in the statements can be written in a simplified alternative manner. These alternatives are only recommended for very short if...else code:

x <- c(8, 3, 2, 5)

# alternative 1: if ... else written on a single line, without braces
if (any(x < 0)) print("x contains negative numbers") else print("x contains all positive numbers")
## [1] "x contains all positive numbers"

# alternative 2: the ifelse() function, which returns the chosen string
ifelse(
  test = any(x < 0),
  yes = "x contains negative numbers",
  no = "x contains all positive numbers"
)
## [1] "x contains all positive numbers"
We can also nest as many if...else statements as required (or desired). For example:

# this test results in the "else if" branch (statement 2) being executed
x <- 7

# if() needs a single TRUE/FALSE, so the two comparisons are combined with
# the scalar, short-circuiting `&&`; the element-wise `&` is meant for
# building vector masks.
if (x >= 10) {
  print("x exceeds acceptable tolerance levels")
} else if (x >= 0 && x < 10) {
  print("x is within acceptable tolerance levels")
} else {
  print("x is negative")
}
## [1] "x is within acceptable tolerance levels"


for loop
The for loop is used to execute repetitive code statements for a particular number of times. The general syntax is provided below where i is the counter and as i assumes each sequential value defined (1 through 100 in this example) the code in the body will be performed for that ith value.

# syntax of for loop
for(i in 1:100) {
        <do stuff here with i>
}
For example, the following for loop iterates through each value (2010, 2011, …, 2016) and performs the paste and print functions inside the curly brackets.

# Build and print one message for each year from 2010 to 2016.
for (i in seq(from = 2010, to = 2016)) {
  output <- paste("The year is", i)
  print(output)
}
## [1] "The year is 2010"
## [1] "The year is 2011"
## [1] "The year is 2012"
## [1] "The year is 2013"
## [1] "The year is 2014"
## [1] "The year is 2015"
## [1] "The year is 2016"
If you want to perform the for loop but have the outputs combined into a vector or other data structure, then you can initiate the output data structure prior to the for loop. For instance, if we want to have the previous outputs combined into a single vector x we can initiate x first and then append the for loop output to x.

# Start from an empty object and grow it by one element per iteration.
x <- NULL

for (i in 2010:2016) {
  output <- paste("The year is", i)
  x <- c(x, output)  # add the new message at the end of x
}

x
## [1] "The year is 2010" "The year is 2011" "The year is 2012" "The year is 2013"
## [5] "The year is 2014" "The year is 2015" "The year is 2016"
However, an important lesson to learn is that R is not efficient at growing data objects. As a result, it is more efficient to create an empty data object and fill it with the for loop outputs. In the previous example we grew x by appending new values to it. A more efficient practice is to initiate a vector (or other data structure) of the right size and fill the elements. In the example that follows, we create the vector x of the right size and then fill in each element within the for loop. Although this inefficiency is not noticed in this small example, when you perform larger repetitions it will become noticable so you might as well get in the habit of filling rather than growing.

# Preallocate the result with the right size AND the right type. The loop
# stores character strings, so x is created as a character vector; a numeric
# vector would be converted to character on the first assignment.
years <- 2010:2016
x <- vector(mode = "character", length = length(years))

# seq_along() gives the positions 1, 2, ... of `years`, so no separate
# counter has to be maintained by hand.
for (i in seq_along(years)) {
  x[i] <- paste("The year is", years[i])
}

x
## [1] "The year is 2010" "The year is 2011" "The year is 2012" "The year is 2013"
## [5] "The year is 2014" "The year is 2015" "The year is 2016"
Another example in which we create an empty matrix with 5 rows and 5 columns. The for loop then iterates over each column (note how i takes on the values 1 through the number of columns in the my.mat matrix) and takes a random draw of 5 values from a poisson distribution with mean i in column i:

# Empty 5 x 5 matrix; NA_integer_ makes it an integer matrix from the start,
# matching the integer counts returned by rpois().
my.mat <- matrix(NA_integer_, nrow = 5, ncol = 5)

# Fill column i with 5 random draws from a Poisson distribution with mean i.
# seq_len() is used instead of 1:ncol() because it yields an empty sequence
# (no iterations) for a matrix with zero columns, where 1:0 would iterate
# over c(1, 0).
for (i in seq_len(ncol(my.mat))) {
  my.mat[, i] <- rpois(5, lambda = i)
}
my.mat
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    0    2    1    7    1
## [2,]    1    2    2    3    9
## [3,]    2    1    5    6    6
## [4,]    2    1    5    2   10
## [5,]    0    2    2    2    4


while loop
While loops begin by testing a condition. If it is true, then they execute the statement. Once the statement is executed, the condition is tested again, and so forth, until the condition is false, after which the loop exits. It’s considered a best practice to include a counter object to keep track of total iterations

# syntax of while loop
counter <- 1

while(test_expression) {
        statement
        counter <- counter + 1
}
while loops can potentially result in infinite loops if not written properly; therefore, you must use them with care. The following simple example illustrates how similar for and while loops are:

# while loop: print the numbers 1 to 10, advancing the counter by hand
counter <- 1

while (counter <= 10) {
  print(counter)
  counter <- counter + 1
}

# this for loop provides the same output
# (counter is now reused as a vector of length 10 to iterate over)
counter <- vector(mode = "numeric", length = 10)

# seq_along() is safe even for a zero-length vector, where 1:length() would
# iterate over c(1, 0).
for (i in seq_along(counter)) {
  print(i)
}
The primary difference between a for loop and a while loop is: a for loop is used when the number of iterations a code should be run is known where a while loop is used when the number of iterations is not known. For instance, the following takes value x and adds or subtracts 1 from the value randomly until x exceeds the values in the test expression. The output illustrates that the code runs 14 times until x exceeded the threshold with the value 9.

counter <- 1
x <- 5
set.seed(3)

# Keep walking while x stays inside the interval [3, 8].
while (x >= 3 && x <= 8) {
  coin <- rbinom(n = 1, size = 1, prob = 0.5)

  ## random walk: a coin of 1 moves x up by one, otherwise x moves down by one
  x <- if (coin == 1) x + 1 else x - 1

  cat("On iteration", counter, ", x =", x, "\n")
  counter <- counter + 1
}
## On iteration 1 , x = 4 
## On iteration 2 , x = 5 
## On iteration 3 , x = 4 
## On iteration 4 , x = 3 
## On iteration 5 , x = 4 
## On iteration 6 , x = 5 
## On iteration 7 , x = 4 
## On iteration 8 , x = 3 
## On iteration 9 , x = 4 
## On iteration 10 , x = 5 
## On iteration 11 , x = 6 
## On iteration 12 , x = 7 
## On iteration 13 , x = 8 
## On iteration 14 , x = 9


repeat loop
A repeat loop is used to iterate over a block of code multiple times. There is no test expression in a repeat loop to end or exit the loop. Rather, we must put a condition statement explicitly inside the body of the loop and use the break statement to exit the loop. Failing to do so will result in an infinite loop.

# syntax of repeat loop
# (template only: `statement` and `test_expression` are placeholders to be
# replaced with real code before this can run)
counter <- 1

# repeat has no condition of its own; the only exit is the break below
repeat {
        statement
        
        # leave the loop once the exit condition holds
        if(test_expression){
                break
        }
        # reached only when the loop did not break on this pass
        counter <- counter + 1
}
For example, say we want to randomly draw values from a uniform distribution between 1 and 25. Furthermore, we want to continue to draw values randomly until our sample contains every integer value between 1 and 25 at least once; however, we do not care if we’ve drawn a particular value multiple times. The following code repeats the random draws of values between 1 and 25 (which we round). We then include an if statement to check if all values between 1 and 25 are present in our sample. If so, we use the break statement to exit the loop. If not, we add to our counter and let the loop repeat until the conditional if statement is found to be true. We can then check the counter object to assess how many iterations were required to reach our conditional requirement.

# Keep drawing rounded uniform values until every integer from 1 to 25 has
# appeared at least once. `counter` ends up equal to the number of draws made.
x <- NULL
counter <- 1

repeat {
        # append one new draw, rounded to the nearest integer
        x <- c(x, round(runif(1, min = 1, max = 25)))

        # stop as soon as no integer in 1..25 is missing from the sample
        if (length(setdiff(1:25, x)) == 0) {
                break
        }

        counter <- counter + 1
}

counter
## [1] 75


break/next arguments
The break argument is used to exit a loop immediately, regardless of what iteration the loop may be on. break arguments are typically embedded in an if statement in which a condition is assessed, if TRUE break out of the loop, if FALSE continue on with the loop. In a nested looping situation, where there is a loop inside another loop, this statement exits from the innermost loop that is being evaluated.

In this example, the for loop will iterate for each element in x; however, when it gets to the element that equals 3 it will break out and end the for loop process.

# Print the elements of x in order; the loop ends for good as soon as the
# element equal to 3 is reached, so only 1 and 2 are printed.
x <- seq_len(5)

for (i in x) {
        if (i == 3) break
        print(i)
}
## [1] 1
## [1] 2
The next argument is useful when we want to skip the current iteration of a loop without terminating it. On encountering next, the R parser skips further evaluation and starts the next iteration of the loop. In this example, the for loop will iterate for each element in x; however, when it gets to the element that equals 3 it will skip the for loop execution of printing the element and simply jump to the next iteration.

# Print the elements of x in order, skipping (but not stopping at) the
# element equal to 3, so 1, 2, 4 and 5 are printed.
x <- seq_len(5)

for (i in x) {
        if (i == 3) next
        print(i)
}
## [1] 1
## [1] 2
## [1] 4

## [1] 5

========================================================================
> df=data.frame(sex,age)
> df
  sex age
1   1  10
2   2  12
3   1  13
4   2  10

>index1=which(df$sex==1)
>mean(df$age[index1])

*****
In this case,
One equal to sign is used to assign something to something

(For example, x=5 or x<-5 means we are assigning the value 5 to x, similarly, df=data.frame(x$sex,x$age) means we are assigning the data frame to df).

But while comparing, we should use double equal to sign.
(For example, if(x==5){ print(x) } 
Here, the line checks if x equals 5 or not,
If the check is correct, it considers the task inside the bracket, i.e. it prints x.)

So, first of all, while comparing, we should use df$x.sex==1.

Next, 
df$x.sex is a vector and if cannot take a vector argument directly (ifelse can do that, But we would come to that later)

Let's do from basic, start with a for loop:
>temporary1=0 # running sum of df$x.age over the rows where df$x.sex is 1
>temporary2=0 # running count of the rows where df$x.sex is 1
>for(i in seq_along(df$x.sex)){ # seq_along() gives 1..length and is empty (not 1, 0) for a zero-length column
> if(df$x.sex[i]==1){
> temporary1=df$x.age[i]+temporary1
> temporary2=1+temporary2
> }
>}
>temporary1/temporary2 # mean = sum divided by count

In this code part, first, we have taken two temporary variables named 'temporary1' and 'temporary2' and set these to 0.
Then, we have run i from i in 1,2,3,4,....,length(df$x.sex).
At each iteration i, it checks whether the first variable at the i-th point is 1 or not; if it is 1, the second variable at that i-th point is added to the variable 'temporary1' and 1 is added to the 'temporary2' variable.
If the check says it is not 1, both the temporary variables remain the same.

Thus, when the whole loop is executed, temporary1 contains the sum over the points where df$x.sex is equal to 1 and temporary2 contains the count of how many times this occurs.

So average is just sum divided by count. Hence, temporary1/temporary2 gives the mean.

The other and short way is (first step by step explained):

>index1=which(df$x.sex==1)
>index1

Say, first we decide which of the df$x.sex equals 1 and stores that in 'index1'.
Then, only take corresponding values from the other variable:

>df$x.age[index1]

Then we can see this is just giving those needed values only, simply take their mean

>mean(df$x.age[index1])

In short, we can do:

>index1=which(df$x.sex==1)
>mean(df$x.age[index1])
- Shri Subrata Pal


Another approach

> by( df$age, df$sex, mean)
df$sex: 1
[1] 13
------------------------------------------- 
df$sex: 2
[1] 12.33333

>===================================================================================

ANOVA

>sex=c(1,2,1,2,1,2,1,2,1,2,1,2,1) # group code for each observation; lower-case to match the formulas below (R is case-sensitive)
>age=c(10,12,10,13,10,12,13,12,21,31,13,14,15) # one age per observation, same length as sex
>boxplot(age~sex) # one box of age values per sex code
> ana=aov(age~sex) # fit the analysis-of-variance model of age on sex
> summary(ana) # print the ANOVA table
            Df Sum Sq Mean Sq F value Pr(>F)
sex          1   20.6   20.58   0.595  0.457

Residuals   11  380.2   34.56  






=========================================================================
Learn Data Visualization With R
Change the directory
> dir() # list the files in the current working directory
> hpl <- read.csv("RIASEC.csv", header = TRUE, sep = ",") # read a CSV file
> hpl <- read.csv("RIASEC.csv") # same call relying on the defaults (header = TRUE, sep = ",")
> str(hpl) # show the structure of the object (str is lower-case; R is case-sensitive)
> hpl <- read.table(file.choose(), header = TRUE, sep = ",") # pick the file in a dialog; note the plain ASCII quotes
> hpl
> hpl <- read.delim(file.choose(), header = TRUE) # read a tab-delimited text file picked in a dialog
> names(hpl) # names of the columns
> hpl[1, 1] # first row and first column
> hpl[, 4] # every row of the fourth column
> hpl$Age # the Age column
> mean(hpl$Age) # average; the column is reached through hpl$ because hpl is not attached
> sd(hpl$Age) # standard deviation
> sd(hpl$Age) / mean(hpl$Age) * 100 # coefficient of variation
> hist(hpl$Age) # histogram
> hist(hpl$Age, breaks = 20) # histogram with about 20 bins
> plot(hpl$Age ~ y) # scatterplot of Age against y (y is not defined in this snippet)
> stem(hpl$Age) # stem-and-leaf plot
# Turn the 0/1 codes into a labelled factor: 0 is shown as "boy", 1 as "girl".
kids <- factor(
  c(1, 0, 1, 0, 0, 0),
  levels = c(0, 1),
  labels = c("boy", "girl")
)
x1=as.numeric(x) # conversion from character to numeric
# Matrix
> xx = matrix(1:6, nrow=3, ncol =2)
> xx
     [,1] [,2]
[1,]    1    4
[2,]    2    5

[3,]    3    6

FILE SAVING AND LOADING

# Write the object `xdata` to a CSV file.
#   file = "E:\\rdataset\\myfile.csv" is the destination path
#   row.names = FALSE leaves the row names out of the written file
write.csv(xdata, file = "E:\\rdataset\\myfile.csv", row.names = FALSE)

Saving file
# Save the object `xdata` to the file "xdata.rds" (relative to the working
# directory).
saveRDS(xdata, "xdata.rds")

# The saved file can be loaded through the Open File dialog, or read back
# into `a` with readRDS(). The path must stay on a single line: a line break
# inside the quotes would become part of the file name and the read would fail.
a <- readRDS("C://Users//Dr.D.D.Roy//Documents//xdata.rds")

No comments:

Post a Comment