[R] 基本資料架構

程式語言:R
官網
官方文件

簡介:基本資料架構

基本

  • 所有 index 皆是從 1 開始,而不是 0
  • class(x) 
    • 判斷 x 架構,像是 matrix
  • mode(x) 
    • 判斷 x 內含的資料屬性,像是 numeric
  • typeof(x)
    • 判斷更精確的 x 內含資料屬性,像是 double
  • attributes(x)
    • 得到 x 的屬性,像是 dim

資料屬性

  • character
    • 字串
    • class("test")
  • numeric
    • 實數
    • R 計算上皆是實數計算,除非特別轉換
    • class(10)
  • integer
    • 整數
    • class(1:10) 
    • class(as.integer(10))
  • complex
    • 複數
    • class(2+2i)
  • logical
    • TRUE 或 FALSE
    • class(T)
  • raw
    • bytes
    • class(charToRaw('A'))

Assignment

# 正常做法
x <- c(1,2,3)
# 反著寫也可以
c(1,2,3) -> x
# 用 function
assign("x", c(1,2,3))
# 大部分情況等同 <-,但不建議使用
x = c(1,2,3)

Vector

vector 必定是同個資料屬性,會自動轉換,依以下強弱順序
character > complex > numeric > integer > logical

宣告
  1. # by c()
  2. v <- c(1, 3, 5)
  3. # by seq
  4. v <- seq(1, 5, by=2)
  5. # by :
  6. v <- 5:1
  7. # by rep
  8. v <- rep(1, times=5)
  9. # boolean
  10. v <- c(1, 3, 5) > 3
  11. # is.na
  12. v <- is.na(c(1:3,NA))
  13. # character
  14. v <- paste(c('x', 'y'), 1:5, sep="")

得值
  1. # init
  2. v <- c(seq(1, 10, by=2), NA)
  3. vLen = length(v)
  4. vNames <- as.raw(0x41:(0x41+vLen-1))
  5. vNames <- rawToChar(vNames)
  6. vNames <- strsplit(vNames, split='')
  7. vNames <- unlist(vNames)
  8. names(v) <- vNames
  9. print(v)
  10. # A B C D E F
  11. # 1 3 5 7 9 NA
  12.  
  13. # index 從 1 開始,而不是 0
  14. v[1:10]
  15. # A B C D E F <NA> <NA> <NA> <NA>
  16. # 1 3 5 7 9 NA NA NA NA NA
  17.  
  18. v[!is.na(v) & v>5]
  19. # D E
  20. # 7 9
  21.  
  22. # 排除前兩項
  23. v[-(1:2)]
  24. # C D E F
  25. # 5 7 9 NA
  26.  
  27. v[c('A', 'D')]
  28. # A D
  29. # 1 7
  30.  
  31. # 將大於 3 的值設為負的
  32. index <- !is.na(v) & v>3
  33. v[index] = -v[index]
  34. print(v)
  35. # A B C D E F
  36. # 1 3 -5 -7 -9 NA

Array


宣告
  1. # 將 z 轉換為三維陣列,大小為 2x3x4
  2. z <- 1:24
  3. dim(z) <- c(2,3,4)
  4.  
  5. # 初始值為 0 的 2x3 陣列
  6. array(0, c(2,3))

得值
  1. # 建立三維陣列,大小為 2x3x4
  2. z <- array(1:24, dim=c(2,3,4))
  3. # , , 1
  4. #
  5. # [,1] [,2] [,3]
  6. # [1,] 1 3 5
  7. # [2,] 2 4 6
  8. #
  9. # , , 2
  10. #
  11. # [,1] [,2] [,3]
  12. # [1,] 7 9 11
  13. # [2,] 8 10 12
  14. #
  15. # , , 3
  16. #
  17. # [,1] [,2] [,3]
  18. # [1,] 13 15 17
  19. # [2,] 14 16 18
  20. #
  21. # , , 4
  22. #
  23. # [,1] [,2] [,3]
  24. # [1,] 19 21 23
  25. # [2,] 20 22 24
  26.  
  27. # 得到位置為 (1,2,3) 的值
  28. z[1,2,3]
  29. # [1] 15
  30.  
  31. # 得到所有 x 位置為 2 的值,大小為 3x4
  32. z[2,,]
  33. # [,1] [,2] [,3] [,4]
  34. # [1,] 2 8 14 20
  35. # [2,] 4 10 16 22
  36. # [3,] 6 12 18 24

Matrix

二維 array 即是 matrix,class 皆為 matrix (R 4.0.0 起 class 為 "matrix" "array")

宣告
  1. a <- array(0, c(2,3))
  2. # [,1] [,2] [,3]
  3. # [1,] 0 0 0
  4. # [2,] 0 0 0
  5.  
  6. m <- matrix(0,nrow=2, ncol=3)
  7. # [,1] [,2] [,3]
  8. # [1,] 0 0 0
  9. # [2,] 0 0 0
  10.  
  11. class(a) == class(m)
  12. # [1] TRUE
  13.  
  14. matrix(c(1:4), nrow=2, ncol=2)
  15. # [,1] [,2]
  16. # [1,] 1 3
  17. # [2,] 2 4
  18.  
  19. # 可以更改成按照 row 填入資料
  20. matrix(c(1:4), nrow=2, ncol=2, byrow=TRUE)
  21. # [,1] [,2]
  22. # [1,] 1 2
  23. # [2,] 3 4
  24.  
  25. matrix(c(1:4), nrow=2, ncol=2, dimnames=list(c('A', 'B'),c('a', 'b')))
  26. # a b
  27. # A 1 3
  28. # B 2 4
  29.  
  30. # 同上
  31. m2 <- matrix(c(1:4), nrow=2, ncol=2)
  32. rownames(m2) <- c('A', 'B')
  33. colnames(m2) <- c('a', 'b')
  34. print(m2)
  35. # a b
  36. # A 1 3
  37. # B 2 4

得值
  1. m = matrix(c(1:6), nrow=2, ncol=3, dimnames=list(c('A', 'B'), c('a', 'b', 'c')), byrow=T)
  2. # a b c
  3. # A 1 2 3
  4. # B 4 5 6
  5.  
  6. # 得 a 行
  7. m[,'a']
  8. # A B
  9. # 1 4
  10.  
  11. # 得 A 列
  12. m[1,]
  13. # a b c
  14. # 1 2 3
  15.  
  16. # 多加一列
  17. rbind(m, C=7:9)
  18. # a b c
  19. # A 1 2 3
  20. # B 4 5 6
  21. # C 7 8 9
  22.  
  23. # 多加一行
  24. cbind(m, 7:8)
  25. # a b c
  26. # A 1 2 3 7
  27. # B 4 5 6 8

Factors

資料分類用

宣告
  1. # 無序
  2. survey_vector <- c("M", "F", "F", "M", "M")
  3. factor_survey_vector <- factor(survey_vector)
  4. factor_survey_vector
  5. # [1] M F F M M
  6. # Levels: F M
  7.  
  8. # 因無序,比較無意義
  9. factor_survey_vector[1] > factor_survey_vector[2]
  10. # [1] NA
  11. # Warning message:
  12. # In Ops.factor(factor_survey_vector[1], factor_survey_vector[2]) :
  13. # ‘>’ not meaningful for factors
  14.  
  15. # 有序,因溫度有高下之分
  16. temperature_vector <- c("High", "Low", "High","Low", "Medium")
  17. factor_temperature_vector <- factor(temperature_vector, order=TRUE, levels=c("Low", "Medium", "High"))
  18. factor_temperature_vector
  19. # [1] High Low High Low Medium
  20. # Levels: Low < Medium < High
  21.  
  22. # 因有序,比較才有意義
  23. factor_temperature_vector[1] > factor_temperature_vector[2]
  24. # [1] TRUE

運用
  1. survey_vector <- c("M", "F", "F", "M", "M")
  2. factor_survey_vector <- factor(survey_vector)
  3. factor_survey_vector
  4. # [1] M F F M M
  5. # Levels: F M
  6.  
  7. # 更改值
  8. levels(factor_survey_vector) <- c("Female", "Male")
  9.  
  10. factor_survey_vector
  11. # [1] Male Female Female Male Male
  12. # Levels: Female Male
  13.  
  14. # 得到第五個值
  15. factor_survey_vector[5]
  16. # [1] Male
  17. # Levels: Female Male
  18.  
  19. summary(factor_survey_vector)
  20. # Female Male
  21. # 2 3
  1. # tapply 用法
  2. incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,
  3. 61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,
  4. 59, 46, 58, 43)
  5.  
  6. state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa",
  7. "qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas",
  8. "sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa",
  9. "sa", "act", "nsw", "vic", "vic", "act")
  10.  
  11. statef <- factor(state)
  12. # [1] tas sa qld nsw nsw nt wa wa qld vic nsw vic qld qld sa tas sa nt wa
  13. # [20] vic qld nsw nsw wa sa act nsw vic vic act
  14. # Levels: act nsw nt qld sa tas vic wa
  15.  
  16. levels(statef)
  17. # [1] "act" "nsw" "nt" "qld" "sa" "tas" "vic" "wa"
  18.  
  19. # 算出每個 group 的 mean
  20. incmeans <- tapply(incomes, statef, mean)
  21. # act nsw nt qld sa tas vic wa
  22. # 44.50000 57.33333 55.50000 53.60000 55.00000 60.50000 56.00000 52.25000

Lists

list 可以包含不同資料屬性的資料

宣告
  1. v = c(1, 2, 3)
  2. x <- list(a=1, b=TRUE, c="test", d=v)
  3. # $a
  4. # [1] 1
  5. #
  6. # $b
  7. # [1] TRUE
  8. #
  9. # $c
  10. # [1] "test"
  11. #
  12. # $d
  13. # [1] 1 2 3
  14.  
  15. a=1; b=TRUE; c="test"; d=c(1, 2, 3)
  16. x <- list(a, b, c, d)
  17. # 可以看到無名字,例 [[1]],因為不是直接用 = 的方法
  18. print(x)
  19. # [[1]]
  20. # [1] 1
  21. #
  22. # [[2]]
  23. # [1] TRUE
  24. #
  25. # [[3]]
  26. # [1] "test"
  27. #
  28. # [[4]]
  29. # [1] 1 2 3
  30.  
  31. # 加入名字後就同最上面的 list
  32. names(x) <- c('a', 'b', 'c', 'd')
  33. print(x)
  34.  
  35. # 以下建立的 list 同最上面
  36. x <- list()
  37. x$a <- 1
  38. x['b'] <- T
  39. x[3] <- 'test'
  40. names(x)[3] = 'c'
  41. x$d <- c(1, 2, 3)

得值
  1. x <- list(a=1, b=TRUE, c="test", d=c(1, 2, 3))
  2. # $a
  3. # [1] 1
  4. #
  5. # $b
  6. # [1] TRUE
  7. #
  8. # $c
  9. # [1] "test"
  10. #
  11. # $d
  12. # [1] 1 2 3
  13.  
  14. # 若單純指定 index 如下,得到的只是 list class,而非內含物
  15. x['d']
  16. x[4]
  17. class(x['d']) # [1] "list"
  18. # 所以無法得到值,只會得到內含 NULL 的 list
  19. x[4][2]
  20. # $<NA>
  21. # NULL
  22.  
  23. # 得到內含物
  24. x[['d']]
  25. x[[4]]
  26. x$d
  27. class(x$d) # [1] "numeric"
  28. # 可正確得到值
  29. x$d[2]
  30. # [1] 2
  31.  
  32. # list 加入 m
  33. m = matrix(0, nrow=2, ncol=2)
  34. x$m = m
  35. x
  36. # $a
  37. # [1] 1
  38. #
  39. # $b
  40. # [1] TRUE
  41. #
  42. # $c
  43. # [1] "test"
  44. #
  45. # $d
  46. # [1] 1 2 3
  47. #
  48. # $m
  49. # [,1] [,2]
  50. # [1,] 0 0
  51. # [2,] 0 0

Data frames

data.frame 類似資料表,常當作大量資料集,例如:匯入外部檔或讀取資料庫資料等
建立時若為 vector 長度需一致,若為 matrix row 長度需一致

宣告
  1. # name 長度與其他不一致,會出錯
  2. name <- c("Joe", "Bob")
  3. age <- c("28", "26", "34")
  4. gender <- c("Male","Male","Female")
  5. data.frame(name, age, gender)
  6. # 修正如下
  7. name <- c("Joe", "Bob", "Vicky")
  8. data.frame(name, age, gender)
  9. # name age gender
  10. # 1 Joe 28 Male
  11. # 2 Bob 26 Male
  12. # 3 Vicky 34 Female
  13.  
  14. # row size 不一致,會出錯
  15. data.frame(array(0, c(4,3)), array(0, c(3,2)))
  16. # 修正如下
  17. data.frame(array(0, c(3,3)), array(0, c(3,2)))
  18. # X1 X2 X3 X1.1 X2.1
  19. # 1 0 0 0 0 0
  20. # 2 0 0 0 0 0
  21. # 3 0 0 0 0 0
  22.  
  23. df <- data.frame(character(3))
  24. df[1] <- c("Joe", "Bob", "Vicky")
  25. df[2] <- c("28", "26", "34")
  26. df[3] <- c("Male","Male","Female")
  27. colnames(df) <- c('name', 'age', 'gender')
  28. # 同最上面的資料
  29. print(df)
  30. # name age gender
  31. # 1 Joe 28 Male
  32. # 2 Bob 26 Male
  33. # 3 Vicky 34 Female
  34. rownames(df) <- c('第一人', '第二人', '第三人')
  35. print(df)
  36. # name age gender
  37. # 第一人 Joe 28 Male
  38. # 第二人 Bob 26 Male
  39. # 第三人 Vicky 34 Female
  40.  
  41. # 顯示資料基本資訊 (下列輸出為欄位是 factor 時的結果;若欄位為 character,則會顯示 Length / Class / Mode)
  42. summary(df)
  43. # name age gender
  44. # Bob :1 26:1 Female:1
  45. # Joe :1 28:1 Male :2
  46. # Vicky:1 34:1

得值
  1. name <- c("Joe", "Bob", "Vicky")
  2. age <- c("28", "26", "34")
  3. gender <- c("Male","Male","Female")
  4. df <- data.frame(name, age, gender)
  5. # name age gender
  6. # 1 Joe 28 Male
  7. # 2 Bob 26 Male
  8. # 3 Vicky 34 Female
  9.  
  10. df[1,1]
  11. # [1] "Joe"
  12.  
  13. df[1,]
  14. # name age gender
  15. # 1 Joe 28 Male
  16.  
  17. df[,1]
  18. df[,'name']
  19. df$name
  20. # [1] "Joe" "Bob" "Vicky"

參考

R Basic
Data Types

留言