[R] 基本資料架構

程式語言:R
官網
官方文件

簡介:基本資料架構

基本

  • 所有 index 皆是從 1 開始,而不是 0
  • class(x) 
    • 判斷 x 架構,像是 matrix
  • mode(x) 
    • 判斷 x 內含的資料屬性,像是 numeric
  • typeof(x)
    • 判斷更精確的 x 內含資料屬性,像是 double
  • attributes(x)
    • 得到 x 的屬性,像是 dim

資料屬性

  • character
    • 字串
    • class("test")
  • numeric
    • 實數
    • R 計算上皆是實數計算,除非特別轉換
    • class(10)
  • integer
    • 整數
    • class(1:10) 
    • class(as.integer(10))
  • complex
    • 複數
    • class(2+2i)
  • logical
    • TRUE 或 FALSE(必須全大寫;T、F 只是可被重新指定的變數,不建議使用)
    • class(TRUE)
  • raw
    • bytes
    • class(charToRaw('A'))

Assignment

# Demonstrates the different ways to bind a value to the name `x`.
# Usual form: leftward assignment
x <- c(1,2,3)
# The rightward form also works
c(1,2,3) -> x
# Via a function call
assign("x", c(1,2,3))
# Equivalent to <- in most situations, but not recommended
x = c(1,2,3)

Vector

vector 必定是同個資料屬性,會自動轉換,依以下強弱順序
character > complex > numeric > integer > logical

宣告
# Common ways to build a vector; every statement rebinds `v`.
# by c(): combine explicit values
v <- c(1, 3, 5)
# by seq(): 1, 3, 5
v <- seq(1, 5, by = 2)
# by `:` -- a descending integer range 5, 4, 3, 2, 1
v <- 5:1
# by rep(): the value 1 repeated five times
v <- rep(1, times = 5)
# logical vector from an elementwise comparison: FALSE FALSE TRUE
v <- c(1, 3, 5) > 3
# logical vector from is.na(): FALSE FALSE FALSE TRUE
v <- is.na(c(1:3, NA))
# character vector: c("x", "y") is recycled against 1:5, giving
# "x1" "y2" "x3" "y4" "x5"; paste0() replaces paste(..., sep = "")
v <- paste0(c("x", "y"), 1:5)

得值
# init: the odd numbers 1..9 followed by one missing value
v <- c(seq(1, 10, by = 2), NA)
vLen <- length(v)
# Label the elements "A", "B", ... with the built-in LETTERS constant
# rather than a raw-byte -> string -> strsplit round trip.
vNames <- LETTERS[seq_len(vLen)]
names(v) <- vNames
print(v)
# A  B  C  D  E  F 
# 1  3  5  7  9 NA 

# Indexing starts at 1, not 0; positions past the end yield NA
v[1:10]
# A    B    C    D    E    F <NA> <NA> <NA> <NA> 
# 1    3    5    7    9   NA   NA   NA   NA   NA

# Logical mask: non-missing values greater than 5
v[!is.na(v) & v > 5]
# D E 
# 7 9 

# Negative indices exclude: drop the first two elements
v[-(1:2)]
# C  D  E  F 
# 5  7  9 NA

# Select by element name
v[c("A", "D")]
# A D 
# 1 7

# Negate every value greater than 3 (the NA is left untouched by the mask)
index <- !is.na(v) & v > 3
v[index] <- -v[index]
print(v)
# A  B  C  D  E  F 
# 1  3 -5 -7 -9 NA 

Array


宣告
# Turn the vector 1..24 into a 2x3x4 array by assigning its dim attribute
z <- seq_len(24)
dim(z) <- c(2L, 3L, 4L)

# A 2x3 array with every cell initialised to 0
array(data = 0, dim = c(2, 3))

得值
# Build a 2x3x4 array; values fill the first dimension fastest
z <- array(data = seq_len(24), dim = c(2, 3, 4))
# , , 1
# 
#      [,1] [,2] [,3]
# [1,]    1    3    5
# [2,]    2    4    6
# 
# , , 2
# 
#      [,1] [,2] [,3]
# [1,]    7    9   11
# [2,]    8   10   12
# 
# , , 3
# 
#      [,1] [,2] [,3]
# [1,]   13   15   17
# [2,]   14   16   18
# 
# , , 4
# 
#      [,1] [,2] [,3]
# [1,]   19   21   23
# [2,]   20   22   24

# The single value at position (1, 2, 3)
z[1, 2, 3]
# [1] 15

# Every value whose first index is 2; the result is a 3x4 matrix
z[2, , ]
#      [,1] [,2] [,3] [,4]
# [1,]    2    8   14   20
# [2,]    4   10   16   22
# [3,]    6   12   18   24

Matrix

二維 array 即是 matrix;自 R 4.0.0 起兩者的 class 皆為 c("matrix", "array")(舊版僅為 "matrix")

宣告
# A two-dimensional array and a matrix are the same kind of object.
a <- array(0, c(2, 3))
#      [,1] [,2] [,3]
# [1,]    0    0    0
# [2,]    0    0    0

m <- matrix(0, nrow = 2, ncol = 3)
#      [,1] [,2] [,3]
# [1,]    0    0    0
# [2,]    0    0    0

# class() of a matrix has length 2 on R >= 4.0 ("matrix" "array"), so `==`
# would return two values; identical() always gives a single TRUE/FALSE.
identical(class(a), class(m))
# [1] TRUE

# Values are filled column by column by default
matrix(1:4, nrow = 2, ncol = 2)
#      [,1] [,2]
# [1,]    1    3
# [2,]    2    4

# byrow = TRUE fills row by row instead
matrix(1:4, nrow = 2, ncol = 2, byrow = TRUE)
#      [,1] [,2]
# [1,]    1    2
# [2,]    3    4

# Row and column names can be supplied at creation through dimnames
matrix(1:4, nrow = 2, ncol = 2,
       dimnames = list(c("A", "B"), c("a", "b")))
#   a b
# A 1 3
# B 2 4

# Same result, naming the rows and columns after creation
m2 <- matrix(1:4, nrow = 2, ncol = 2)
rownames(m2) <- c("A", "B")
colnames(m2) <- c("a", "b")
print(m2)
#   a b
# A 1 3
# B 2 4

得值
# A 2x3 matrix filled row by row, with named rows (A, B) and columns (a, b, c)
m <- matrix(1:6, nrow = 2, ncol = 3, byrow = TRUE,
            dimnames = list(c("A", "B"), c("a", "b", "c")))
#   a b c
# A 1 2 3
# B 4 5 6

# Extract column "a" (the result drops to a named vector)
m[, "a"]
# A B 
# 1 4

# Extract row "A" (the result drops to a named vector)
m["A", ]
# a b c 
# 1 2 3 

# Append a row named "C"; returns a new matrix, `m` itself is unchanged
rbind(m, C = 7:9)
#   a b c
# A 1 2 3
# B 4 5 6
# C 7 8 9

# Append an unnamed column; returns a new matrix, `m` itself is unchanged
cbind(m, 7:8)
#   a b c  
# A 1 2 3 7
# B 4 5 6 8

Factors

資料分類用

宣告
# Unordered factor: levels default to the sorted unique values
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)
factor_survey_vector
# [1] M F F M M
# Levels: F M

# Without an ordering, a comparison is meaningless: NA plus a warning
factor_survey_vector[1] > factor_survey_vector[2]
# [1] NA
# Warning message:
#     In Ops.factor(factor_survey_vector[1], factor_survey_vector[2]) :
#     ‘>’ not meaningful for factors

# Ordered factor: temperatures have a natural low-to-high ranking.
# The argument is spelled out as `ordered`; `order = TRUE` only worked
# through partial argument matching.
temperature_vector <- c("High", "Low", "High", "Low", "Medium")
factor_temperature_vector <- factor(
  temperature_vector,
  levels = c("Low", "Medium", "High"),
  ordered = TRUE
)
factor_temperature_vector
# [1] High   Low    High   Low    Medium
# Levels: Low < Medium < High

# With an ordering, comparisons are meaningful
factor_temperature_vector[1] > factor_temperature_vector[2]
# [1] TRUE

運用
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(x = survey_vector)
factor_survey_vector
# [1] M F F M M
# Levels: F M

# Relabel the levels; the new labels follow the existing level order (F, M)
levels(factor_survey_vector) <- c("Female", "Male")

factor_survey_vector
# [1] Male Female Female Male Male  
# Levels: Female Male

# Fifth element (still a factor, so the levels are printed too)
factor_survey_vector[5]
# [1] Male
# Levels: Female Male

# Count of observations per level
summary(factor_survey_vector)
# Female   Male 
# 2      3 

# tapply example: one income and one state code per person
incomes <- c(
  60, 49, 40, 61, 64, 60, 59, 54, 62, 69,
  70, 42, 56, 61, 61, 61, 58, 51, 48, 65,
  49, 49, 41, 48, 52, 46, 59, 46, 58, 43
)

state <- c(
  "tas", "sa",  "qld", "nsw", "nsw", "nt",  "wa",  "wa",  "qld", "vic",
  "nsw", "vic", "qld", "qld", "sa",  "tas", "sa",  "nt",  "wa",  "vic",
  "qld", "nsw", "nsw", "wa",  "sa",  "act", "nsw", "vic", "vic", "act"
)

statef <- factor(x = state)
# [1] tas sa  qld nsw nsw nt  wa  wa  qld vic nsw vic qld qld sa  tas sa  nt  wa 
# [20] vic qld nsw nsw wa  sa  act nsw vic vic act
# Levels: act nsw nt qld sa tas vic wa

levels(statef)
# [1] "act" "nsw" "nt"  "qld" "sa"  "tas" "vic" "wa" 

# Mean income within each state group
incmeans <- tapply(X = incomes, INDEX = statef, FUN = mean)
# act      nsw       nt      qld       sa      tas      vic       wa 
# 44.50000 57.33333 55.50000 53.60000 55.00000 60.50000 56.00000 52.25000

Lists

list 可以包含不同資料屬性的資料

宣告
# Elements named at creation with name = value pairs
v <- c(1, 2, 3)
x <- list(a = 1, b = TRUE, c = "test", d = v)
# $a
# [1] 1
# 
# $b
# [1] TRUE
# 
# $c
# [1] "test"
# 
# $d
# [1] 1 2 3

# Building from existing variables. The variables are deliberately not named
# a/b/c/d: a variable called `c` would shadow base::c().
first <- 1
second <- TRUE
third <- "test"
fourth <- c(1, 2, 3)
x <- list(first, second, third, fourth)
# The elements are unnamed (shown as [[1]], ...) because no name = value
# pairs were given
print(x)
# [[1]]
# [1] 1
# 
# [[2]]
# [1] TRUE
# 
# [[3]]
# [1] "test"
# 
# [[4]]
# [1] 1 2 3

# After adding names the list equals the first one
names(x) <- c("a", "b", "c", "d")
print(x)

# Building the same list incrementally, one element at a time
x <- list()
x$a <- 1             # by $name
x["b"] <- TRUE       # by name with [
x[3] <- "test"       # by position; the new element has an empty name ...
names(x)[3] <- "c"   # ... which is filled in here
x$d <- c(1, 2, 3)

得值
x <- list(a = 1, b = TRUE, c = "test", d = c(1, 2, 3))
# $a
# [1] 1
# 
# $b
# [1] TRUE
# 
# $c
# [1] "test"
# 
# $d
# [1] 1 2 3

# Single brackets return a sub-list (class "list"), not the element inside
x["d"]
x[4]
class(x["d"]) # [1] "list"
# so indexing further does not reach the stored value: x[4] is a list of
# length 1, and its second position does not exist
x[4][2]
# $<NA>
# NULL

# Double brackets or $ return the element itself
x[["d"]]
x[[4]]
x$d
class(x$d) # [1] "numeric"
# so the value can be indexed as expected
x$d[2]
# [1] 2

# Add a matrix to the list under the name `m`
m <- matrix(data = 0, nrow = 2, ncol = 2)
x$m <- m
x
# $a
# [1] 1
# 
# $b
# [1] TRUE
# 
# $c
# [1] "test"
# 
# $d
# [1] 1 2 3
# 
# $m
#      [,1] [,2]
# [1,]    0    0
# [2,]    0    0

Data frames

data.frame 類似資料表,常當作大量資料集,例如:匯入外部檔或讀取資料庫資料等
建立時,各欄若為 vector 則長度需一致;若為 matrix 則 row 數需一致

宣告
# Column vectors must all have the same length. `name` has 2 elements while
# the others have 3, so data.frame() raises an error; try() reports the error
# without aborting the rest of the script.
name <- c("Joe", "Bob")
age <- c("28", "26", "34")
gender <- c("Male", "Male", "Female")
try(data.frame(name, age, gender))
# Error in data.frame(name, age, gender) : 
#   arguments imply differing number of rows: 2, 3
# Fixed by giving `name` three elements
name <- c("Joe", "Bob", "Vicky")
data.frame(name, age, gender)
#    name age gender
# 1   Joe  28   Male
# 2   Bob  26   Male
# 3 Vicky  34 Female

# Matrices must agree on the number of rows; 4 vs 3 raises an error
try(data.frame(array(0, c(4, 3)), array(0, c(3, 2))))
# Fixed by using 3 rows in both
data.frame(array(0, c(3, 3)), array(0, c(3, 2)))
#   X1 X2 X3 X1.1 X2.1
# 1  0  0  0    0    0
# 2  0  0  0    0    0
# 3  0  0  0    0    0

# Build the same table column by column, starting from one placeholder column
df <- data.frame(character(3))
df[1] <- c("Joe", "Bob", "Vicky")
df[2] <- c("28", "26", "34")
df[3] <- c("Male", "Male", "Female")
colnames(df) <- c("name", "age", "gender")
# Same data as the first data frame above
print(df)
#    name age gender
# 1   Joe  28   Male
# 2   Bob  26   Male
# 3 Vicky  34 Female
rownames(df) <- c('第一人', '第二人', '第三人')
print(df)
#         name age gender
# 第一人   Joe  28   Male
# 第二人   Bob  26   Male
# 第三人 Vicky  34 Female

# Per-column overview. All three columns are character vectors, so only
# length/class/mode are reported; per-value counts would require factors.
summary(df)
#      name               age               gender         
#  Length:3           Length:3           Length:3          
#  Class :character   Class :character   Class :character  
#  Mode  :character   Mode  :character   Mode  :character  

得值
name <- c("Joe", "Bob", "Vicky")
age <- c("28", "26", "34")
gender <- c("Male", "Male", "Female")
df <- data.frame(name = name, age = age, gender = gender)
#    name age gender
# 1   Joe  28   Male
# 2   Bob  26   Male
# 3 Vicky  34 Female

# One cell: row 1, column 1
df[1, 1]
# [1] "Joe"

# One whole row (still a data frame)
df[1, ]
#   name age gender
# 1  Joe  28   Male

# One whole column as a vector: by position, by name, or with $
df[, 1]
df[, "name"]
df$name
# [1] "Joe"   "Bob"   "Vicky"

參考

R Basic
Data Types

留言