4 Manipulating data tables

How to merge tables and how to split tables.

4.1 Merging data tables

Be very careful when merging data tables by combining them by columns: the order of the rows must be the same in both tables! It is a good practice to try to make some graphs to visualise and check manually some values to see that everything matches.

When reading files, be sure that these files are present in the working directory

# Cleaning the memory before starting
rm(list=ls())

## Reading files we saved last time:
xy = read.table("coordinates.txt") # default settings
soil.data = read.table("soil.data.csv", sep = ";", header = T) # defining delimiting character (here ";") and that we have column names (header=T)

## As we did last time, let's split soil data into two using the function subset
high.ph.soils = subset(soil.data, pH.KCl > median(pH.KCl))
low.ph.soils = subset(soil.data, pH.KCl <= median(pH.KCl))

combine1 = cbind(xy, soil.data) # putting together columns
head(combine1)
##        lon      lat Proovi.nimi pH.KCl       N..  P.mg.kg   K.mg.kg Proovi.nimi
## 1 26.45699 58.24387    Vapramae   3.72 0.4246261 31.57133  79.04454    Vapramae
## 2 26.43730 58.19846        Illi   3.65 0.1364021 22.26459  42.73691        Illi
## 3 26.41474 58.15658    Vitipalu   4.25 0.2500752 58.04788  60.71264    Vitipalu
## 4 26.27875 58.21060     Konguta   4.40 2.3192906 63.00445 131.18302     Konguta
## 5 26.15339 58.22432     Vehendi   2.62 0.3698321 25.62339 131.56542     Vehendi
## 6 26.32119 58.28426      Erumae   3.28 0.2426881 16.21594  66.16427      Erumae

combine2 = rbind(high.ph.soils, low.ph.soils) # putting together rows
dim(combine2)
## [1] 30  5

combine3 = cbind(xy, combine2) # trying again, but ....
plot(combine1$lon, combine1$pH.KCl)
points(combine3$lon, combine3$pH.KCl, pch = "x", col = "green") # adding points, symbol "x"
head(combine3) # note that site names from xy and new soil data do not match!!
##        lon      lat Proovi.nimi pH.KCl       N..   P.mg.kg   K.mg.kg
## 1 26.45699 58.24387    Vapramae   3.72 0.4246261 31.571334  79.04454
## 2 26.43730 58.19846        Illi   3.65 0.1364021 22.264589  42.73691
## 3 26.41474 58.15658    Vitipalu   4.25 0.2500752 58.047875  60.71264
## 4 26.27875 58.21060     Konguta   4.40 2.3192906 63.004453 131.18302
## 5 26.15339 58.22432     Vehendi   5.77 0.2557405  8.214731  99.41711
## 6 26.32119 58.28426      Erumae   4.01 0.2749614 13.283927  64.78593
##   Proovi.nimi
## 1    Vapramae
## 2        Illi
## 3    Vitipalu
## 4     Konguta
## 5    Porgumae
## 6      Aardla

# The 'merge' function can help in merging two datasets that share a common column with unique identifiers, like the "Proovi.nimi" one in our example:
combine3 = merge(xy, combine2, by = "Proovi.nimi") # merging by common column
head(combine3)
##   Proovi.nimi      lon      lat pH.KCl       N..   P.mg.kg   K.mg.kg
## 1      Aardla 26.74540 58.32910   4.01 0.2749614 13.283927  64.78593
## 2       Aravu 27.42453 58.25486   2.75 0.4787677 24.440573 107.38674
## 3      Erumae 26.32119 58.28426   3.28 0.2426881 16.215942  66.16427
## 4  Haavametsa 27.36984 58.26345   2.69 0.1794443  8.860196  65.97427
## 5      Ignase 26.81712 58.26367   3.40 0.2200201 37.057869  79.17404
## 6        Illi 26.43730 58.19846   3.65 0.1364021 22.264589  42.73691
points(combine3$lon, combine3$pH.KCl, pch = "x", col = "red") # now OK!

# an alternative with the function 'order':
combine4 = cbind(xy[order(xy$Proovi.nimi), ], combine2[order(combine2$Proovi.nimi), ])
points(combine4$lon, combine4$pH.KCl, pch = "+", col = "blue")

4.2 Community data: sites x taxa matrix

Community data is usually a matrix of taxa x sites. In R it is tradition to have rows as sites (samples) and columns as species (taxa). In the field often the opposite direction is used in books, so be careful with this when transcribing your data into R. Here we read a table of forest floor vascular plants from 10x10 m plots.


vas.plants = read.table("vascular.plants.txt")

dim(vas.plants)
## [1] 79 30

vas.plants[1:5, 1:5] # taxa as rows, sites as columns
##            X001Vapramae X002Illi X003Vitipalu X004Konguta X005Vehendi
## ACTAspic              0        0            0           0           0
## AEGOpoda              0        0            1           0           0
## ANEMnemo              0        0            1           0           0
## ASAReuro              1        0            0           0           0
## ATHYfil.fe            0        0            0           0           0

vas.plants = t(vas.plants) # transposing the matrix (replacing rows and columns)

vas.plants = as.data.frame(vas.plants) # transposing loose data.frame sturcture, putting back
vas.plants[1:5, 1:5]
##              ACTAspic AEGOpoda ANEMnemo ASAReuro ATHYfil.fe
## X001Vapramae        0        0        0        1          0
## X002Illi            0        0        0        0          0
## X003Vitipalu        0        1        1        0          0
## X004Konguta         0        0        0        0          0
## X005Vehendi         0        0        0        0          0

image(t(vas.plants),
      axes = F,
      ylab = "Sites",
      xlab = "Taxa") ## Quick visualisation of a table


apply(vas.plants, 1, FUN = max)  # a function can be applied to each row (1) or column (2). Here we find max of each row.
##   X001Vapramae       X002Illi   X003Vitipalu    X004Konguta    X005Vehendi 
##              3              3              3              5              5 
##    X006.Erumae   X007Porgumae     X008Aardla   X009Kurepalu  X010Sarakuste 
##              3              2              3              3              3 
##      X011Kannu      X012Vonnu     X013Rookse    X014Kammeri     X015Kambja 
##              3              3              4              3              3 
##      X016Reola     X017Ignase    X018Unikula X019Haavametsa X020Kruusahaua 
##              3              5              5              3              3 
##      X021Aravu  X026Pedajamae       X027Miti    X028Puhaste  X031Pimmelaan 
##              5              4              3              4              4 
##     X032Logina  X033Voorepalu X036Taevaskoja    X044Savimae X073Pausakunnu 
##              2              4              3              2              2

Use function apply and calculate max values for each taxon in the object vas.plants.

Answer
apply(vas.plants, 2, FUN = max)
##   ACTAspic   AEGOpoda   ANEMnemo   ASAReuro ATHYfil.fe   BRACpinn   CALAarun 
##        1.0        3.0        2.0        1.0        1.0        1.0        3.0 
##   CALLvulg   CAREdigi   CIRCalpi   CONVmaja   DESCflex   DRYOcart   DRYOexpa 
##        3.0        2.0        2.0        3.0        1.0        3.0        1.0 
##   EPILangu   FESTovin   FRAGvesc   GALElute   GERAsylv   GYMNdryo   HEPAnobi 
##        1.0        1.0        2.0        3.0        1.0        1.0        3.0 
##   IMPAnoli   LATHvern   LUZUpilo   LYCOanno   MAIAbifo   MELAprat   MELInuta 
##        0.5        1.0        2.0        5.0        3.0        4.0        1.0 
##   MILIeffu   MOLIcaer   MYCEmura   ORTHsecu   OXALacet   PARIquad   PTERaqui 
##        3.0        1.0        1.0        2.0        5.0        1.0        2.0 
##   RANUcass   RUBUsaxa   STELholo   STELnemo   TRIEeuro   VACCmyrt VACCvit.id 
##        1.0        3.0        2.0        1.0        2.0        5.0        3.0 
##   VEROcham   VIOLmira   ANGEsylv   ANTHsylv   CAMPpers   CAREcane   CAREglob 
##        1.0        1.0        1.0        1.0        0.5        1.0        1.0 
##   CHIMumbe   CREPpalu   DESCcaes   DRYOfili   EQUIprat   EQUIsylv   GALEOPsp 
##        1.0        2.0        1.0        3.0        1.0        2.0        1.0 
##  GALIalbu.   GALIpalu   GOODrepe   HIERvulg   IMPAparv   LYSIvulg   MELAnemo 
##        1.0        1.0        1.0        2.0        3.0        1.0        1.0 
##   MELAsylv   MOERtrin   PHEGconn    POAnemo   POLYodor   POTEerec   PULMobsc 
##        1.0        1.0        1.0        1.0        1.0        1.0        1.0 
##   PYROmino  RANUauri.   SCORhumi   SOLIvirg   URTIdioi   VEROoffi   VIOLcani 
##        1.0        1.0        1.0        2.0        1.0        1.0        1.0 
##   VIOLepip  VIOLrivi. 
##        1.0        1.0

4.3 Long data format

Data matrices might have too many zeros since most taxa are rare. Because of this, a “long” format, with columns indicating site, taxon and some abundance measure is often used. Here we read an example of long format from trees in forest sites from an excel file.

library(readxl) # package to read excel files

data = read_excel("trees.xlsx")
data = as.data.frame(data)      # to make the file data.frame
head(data)
##         ala nr rinne puuliik   D1   D2    H
## 1 008Aardla  1     1      PN   43 41.5 30.1
## 2 008Aardla  2     1      TA 30.5 31.5 21.5
## 3 008Aardla  3     1      HB   19 19.5 21.3
## 4 008Aardla  4     2      TM    7  8.0 10.4
## 5 008Aardla  5     2      TM  7.5  7.0 11.0
## 6 008Aardla  6     2      TM    8  7.0 12.4

# ala = site, puuliik = species (codes of tree species)

tree.counts = table(data[, c(1, 4)])  # counts of occurrences (number of trees)
tree.height = xtabs(H ~ ala + puuliik, data = data) # sum of H (height)

str(tree.counts) # from function table we obtain a bit special format, we better make it a simple data.frame by function as.data.frame
##  'table' int [1:30, 1:12] 0 0 0 0 0 0 0 3 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ ala    : chr [1:30] "001Vapramäe" "002Illi" "003Vitipalu" "004Konguta" ...
##   ..$ puuliik: chr [1:12] "HB" "KS" "KU" "LH" ...
tree.counts = as.data.frame.matrix(tree.counts)
str(tree.counts)
## 'data.frame':    30 obs. of  12 variables:
##  $ HB: int  0 0 0 0 0 0 0 3 0 0 ...
##  $ KS: int  0 1 3 0 0 0 0 1 0 2 ...
##  $ KU: int  3 1 8 8 0 8 5 0 3 6 ...
##  $ LH: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MA: int  3 4 1 2 5 2 0 0 3 1 ...
##  $ MN: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PI: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ PN: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ RE: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ TA: int  0 0 0 0 0 0 1 1 0 0 ...
##  $ TM: int  0 0 0 0 0 0 0 4 0 0 ...
##  $ VA: int  0 0 0 0 0 0 0 0 0 0 ...

4.4 NA in data

For missing values in a table, NA is automatically used.


summary(data) # Note D2
##      ala                  nr             rinne         puuliik         
##  Length:239         Min.   : 1.000   Min.   :1.000   Length:239        
##  Class :character   1st Qu.: 3.000   1st Qu.:1.000   Class :character  
##  Mode  :character   Median : 5.000   Median :1.000   Mode  :character  
##                     Mean   : 5.188   Mean   :1.439                     
##                     3rd Qu.: 7.000   3rd Qu.:2.000                     
##                     Max.   :16.000   Max.   :2.000                     
##                                                                        
##       D1                  D2              H        
##  Length:239         Min.   : 5.20   Min.   : 6.00  
##  Class :character   1st Qu.:12.00   1st Qu.:13.10  
##  Mode  :character   Median :24.50   Median :22.00  
##                     Mean   :24.96   Mean   :22.40  
##                     3rd Qu.:37.75   3rd Qu.:30.85  
##                     Max.   :54.00   Max.   :44.30  
##                     NA's   :9
complete.cases(data) # do we have real values for a row
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
##  [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [73]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [109]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [157]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [169]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [205]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [217]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [229]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

data[!complete.cases(data), ] # not complete cases (note "!")
##              ala nr rinne puuliik     D1 D2    H
## 11    073Sulaoja  1     1      MA   ü177 NA 24.9
## 20    073Sulaoja 11     1      MA   ü189 NA 23.9
## 21     015Kambja  1     1      KU ü166.5 NA 35.0
## 24     015Kambja  5     1      KS   ü229 NA 36.4
## 36   007Põrgumäe  1     1      TA   ü174 NA 33.0
## 44      016Reola  4     1      KU   ü248 NA 34.6
## 77   003Vitipalu  9     1      MA    ü82 NA 36.8
## 113   028Puhaste  2     1      MA   ü203 NA 34.9
## 129 031Pimmelaan  2     1      MA   ü199 NA 36.4

is.na(data$D2) # where we have NA values?
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

4.5 Combining community and site data

Before we analyse community and site data together, it is suggested to check if we have the same order in both tables. We can list rownames and site names side by side



# Checking that the order is the same

cbind(row.names(vas.plants), as.character(xy$Proovi.nimi))
##       [,1]             [,2]        
##  [1,] "X001Vapramae"   "Vapramae"  
##  [2,] "X002Illi"       "Illi"      
##  [3,] "X003Vitipalu"   "Vitipalu"  
##  [4,] "X004Konguta"    "Konguta"   
##  [5,] "X005Vehendi"    "Vehendi"   
##  [6,] "X006.Erumae"    "Erumae"    
##  [7,] "X007Porgumae"   "Porgumae"  
##  [8,] "X008Aardla"     "Aardla"    
##  [9,] "X009Kurepalu"   "Kurepalu"  
## [10,] "X010Sarakuste"  "Sarakuste" 
## [11,] "X011Kannu"      "Kannu"     
## [12,] "X012Vonnu"      "Vonnu"     
## [13,] "X013Rookse"     "Rookse"    
## [14,] "X014Kammeri"    "Kammeri"   
## [15,] "X015Kambja"     "Kambja"    
## [16,] "X016Reola"      "Reola"     
## [17,] "X017Ignase"     "Ignase"    
## [18,] "X018Unikula"    "Unikula"   
## [19,] "X019Haavametsa" "Haavametsa"
## [20,] "X020Kruusahaua" "Kruusahaua"
## [21,] "X021Aravu"      "Aravu"     
## [22,] "X026Pedajamae"  "Pedajamae" 
## [23,] "X027Miti"       "Miti"      
## [24,] "X028Puhaste"    "Puhaste"   
## [25,] "X031Pimmelaan"  "Pimmelaan" 
## [26,] "X032Logina"     "Logina"    
## [27,] "X033Voorepalu"  "Voorepalu" 
## [28,] "X036Taevaskoja" "Taevaskoja"
## [29,] "X044Savimae"    "Savimae"   
## [30,] "X073Pausakunnu" "Sulaoja"

# Species distribution maps

i=3  # id of a species
plot(xy[,1:2],cex=0.3) ## Coordinates of sites and small points
points(xy[,1:2], cex=vas.plants[,i]*2,col="green") # cex changes point size relative to 1
title(main=colnames(vas.plants)[i]) # adding title taking the i-th colname from the community data

4.6 Some text manipulations

If we have names of sites and species, we might also need some text manipulations to cut, paste, search and replace parts of texts.


sites1 = row.names(vas.plants)
sites1
##  [1] "X001Vapramae"   "X002Illi"       "X003Vitipalu"   "X004Konguta"   
##  [5] "X005Vehendi"    "X006.Erumae"    "X007Porgumae"   "X008Aardla"    
##  [9] "X009Kurepalu"   "X010Sarakuste"  "X011Kannu"      "X012Vonnu"     
## [13] "X013Rookse"     "X014Kammeri"    "X015Kambja"     "X016Reola"     
## [17] "X017Ignase"     "X018Unikula"    "X019Haavametsa" "X020Kruusahaua"
## [21] "X021Aravu"      "X026Pedajamae"  "X027Miti"       "X028Puhaste"   
## [25] "X031Pimmelaan"  "X032Logina"     "X033Voorepalu"  "X036Taevaskoja"
## [29] "X044Savimae"    "X073Pausakunnu"

nchar(sites1) # how many characters (length of the text)
##  [1] 12  8 12 11 11 11 12 10 12 13  9  9 10 11 10  9 10 11 14 14  9 13  8 11 13
## [26] 10 13 14 11 14

sites2 = as.character(soil.data$Proovi.nimi) # Making character from a factor data type
sites2
##  [1] "Vapramae"   "Illi"       "Vitipalu"   "Konguta"    "Vehendi"   
##  [6] "Erumae"     "Porgumae"   "Aardla"     "Kurepalu"   "Sarakuste" 
## [11] "Kannu"      "Vonnu"      "Rookse"     "Kammeri"    "Kambja"    
## [16] "Reola"      "Ignase"     "Unikula"    "Haavametsa" "Kruusahaua"
## [21] "Aravu"      "Pedajamae"  "Miti"       "Puhaste"    "Pimmelaan" 
## [26] "Logina"     "Voorepalu"  "Taevaskoja" "Savimae"    "Sulaoja"

substring(sites1, 2, 4) # substring, from 2nd to 4th character
##  [1] "001" "002" "003" "004" "005" "006" "007" "008" "009" "010" "011" "012"
## [13] "013" "014" "015" "016" "017" "018" "019" "020" "021" "026" "027" "028"
## [25] "031" "032" "033" "036" "044" "073"

paste(1:30, sites2, sep = "-") # pasting different texts (non-text will be converted)
##  [1] "1-Vapramae"    "2-Illi"        "3-Vitipalu"    "4-Konguta"    
##  [5] "5-Vehendi"     "6-Erumae"      "7-Porgumae"    "8-Aardla"     
##  [9] "9-Kurepalu"    "10-Sarakuste"  "11-Kannu"      "12-Vonnu"     
## [13] "13-Rookse"     "14-Kammeri"    "15-Kambja"     "16-Reola"     
## [17] "17-Ignase"     "18-Unikula"    "19-Haavametsa" "20-Kruusahaua"
## [21] "21-Aravu"      "22-Pedajamae"  "23-Miti"       "24-Puhaste"   
## [25] "25-Pimmelaan"  "26-Logina"     "27-Voorepalu"  "28-Taevaskoja"
## [29] "29-Savimae"    "30-Sulaoja"


grep("mae", sites1) # numbers of cases with a search string
## [1]  1  6  7 22 29
sites1[grep("mae", sites1)]
## [1] "X001Vapramae"  "X006.Erumae"   "X007Porgumae"  "X026Pedajamae"
## [5] "X044Savimae"
sites1[grep("\\.", sites1)] # some characters as "." "?" have a special meaning but we can search them by leading "\\"
## [1] "X006.Erumae"

gsub("mae", "oru", sites1) # replacing a search string
##  [1] "X001Vapraoru"   "X002Illi"       "X003Vitipalu"   "X004Konguta"   
##  [5] "X005Vehendi"    "X006.Eruoru"    "X007Porguoru"   "X008Aardla"    
##  [9] "X009Kurepalu"   "X010Sarakuste"  "X011Kannu"      "X012Vonnu"     
## [13] "X013Rookse"     "X014Kammeri"    "X015Kambja"     "X016Reola"     
## [17] "X017Ignase"     "X018Unikula"    "X019Haavametsa" "X020Kruusahaua"
## [21] "X021Aravu"      "X026Pedajaoru"  "X027Miti"       "X028Puhaste"   
## [25] "X031Pimmelaan"  "X032Logina"     "X033Voorepalu"  "X036Taevaskoja"
## [29] "X044Savioru"    "X073Pausakunnu"

Make a character vector from sites1 without leading “X”

Answer
substring(sites1, 2, 9999)
##  [1] "001Vapramae"   "002Illi"       "003Vitipalu"   "004Konguta"   
##  [5] "005Vehendi"    "006.Erumae"    "007Porgumae"   "008Aardla"    
##  [9] "009Kurepalu"   "010Sarakuste"  "011Kannu"      "012Vonnu"     
## [13] "013Rookse"     "014Kammeri"    "015Kambja"     "016Reola"     
## [17] "017Ignase"     "018Unikula"    "019Haavametsa" "020Kruusahaua"
## [21] "021Aravu"      "026Pedajamae"  "027Miti"       "028Puhaste"   
## [25] "031Pimmelaan"  "032Logina"     "033Voorepalu"  "036Taevaskoja"
## [29] "044Savimae"    "073Pausakunnu"

4.7 Manipulating sets

We make a set of forest type codes. In the database we have 4 digits but we use a broader classification of three digits. Then we can find how two sets differ.


forest.types = read.table("envir.txt")$forest.types
forest.types
##  [1] 1152 1131 1153 1512 1132 1132 1161 1162 1132 1161 1161 1142 1161 1141 1161
## [16] 1142 1142 1153 1132 1161 1511 1141 1132 1132 1161 1511 1132 1132 1511 1142

forest.types = as.character(forest.types) # numbers do not have meaning, just codes
forest.types
##  [1] "1152" "1131" "1153" "1512" "1132" "1132" "1161" "1162" "1132" "1161"
## [11] "1161" "1142" "1161" "1141" "1161" "1142" "1142" "1153" "1132" "1161"
## [21] "1511" "1141" "1132" "1132" "1161" "1511" "1132" "1132" "1511" "1142"

forest.types = substring(forest.types, 1, 3)
forest.types
##  [1] "115" "113" "115" "151" "113" "113" "116" "116" "113" "116" "116" "114"
## [13] "116" "114" "116" "114" "114" "115" "113" "116" "151" "114" "113" "113"
## [25] "116" "151" "113" "113" "151" "114"

unique(forest.types) # set of unique values
## [1] "115" "113" "151" "116" "114"
table(forest.types) # frequency table
## forest.types
## 113 114 115 116 151 
##   9   6   3   8   4

# separating 2 groups according to soil pH
forest.types.1 = forest.types[soil.data$pH.KCl < 3]
forest.types.2 = forest.types[soil.data$pH.KCl >= 3]

forest.types.1
## [1] "113" "113" "114" "114" "113" "151" "113" "113" "151"
forest.types.2
##  [1] "115" "113" "115" "151" "113" "116" "116" "116" "116" "114" "116" "116"
## [13] "114" "115" "116" "114" "113" "116" "151" "113" "114"

union(forest.types.1, forest.types.2) # from two sets (here same as unique of all)
## [1] "113" "114" "151" "115" "116"
intersect(forest.types.1, forest.types.2) # in both
## [1] "113" "114" "151"
setdiff(forest.types.2, forest.types.1) # in set 2 but not in set 1
## [1] "115" "116"
setdiff(forest.types.1, forest.types.2) # None!
## character(0)

is.element(forest.types.2, forest.types.1) # asking which in set 2 are present in set 1
##  [1] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [13]  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE

4.8 Making our own function to combine two datasets

Sometimes we want to combine different files which include different sites but species composition do not match. I have not found a good function in R for that. But we can make our own functions!


# We start with some artificial examples, just two species in both sets

t1 = data.frame(a = c(1, 0, 1), b = c(2, 2, 0))
t2 = data.frame(b = c(3, 0), c = c(0, 4))

t1
##   a b
## 1 1 2
## 2 0 2
## 3 1 0
t2
##   b c
## 1 3 0
## 2 0 4



t2[, setdiff(names(t1), names(t2))] = 0 # adding now species only found in table1 to table2 as zeros
t1[, setdiff(names(t2), names(t1))] = 0 # same for table1

t1
##   a b c
## 1 1 2 0
## 2 0 2 0
## 3 1 0 0
t2
##   b c a
## 1 3 0 0
## 2 0 4 0

rbind(t1, t2) # now mergind two tables (rbind looks names!)
##   a b c
## 1 1 2 0
## 2 0 2 0
## 3 1 0 0
## 4 0 3 0
## 5 0 0 4


# Let's make a function from script above!

tables.join = function(t1, t2) {
  # asking two tables which got names t1 and t2
  t2[, setdiff(names(t1), names(t2))] = 0
  t1[, setdiff(names(t2), names(t1))] = 0
  t12 = rbind(t1, t2)
  return(t12) # returns value from function
}

tables.join(t1, t2)
##   a b c
## 1 1 2 0
## 2 0 2 0
## 3 1 0 0
## 4 0 3 0
## 5 0 0 4

t12 # objects within a functions are not kept in memory!
## Error in eval(expr, envir, enclos): object 't12' not found

When using this function you can specify your input objects (let’s imagine that these are called a.table and b.table) either by defining function parameters: tables.join(t1=a.table,t2=b.table), or just by position tables.join(a.table,b.table)

4.9 Saving community data for the future

save(vas.plants,
     tree.counts,
     forest.types,
     xy,
     soil.data,
     tables.join,
     file = "community.rda")