在数据科学的世界里,字符串操作是一项基础而关键的技能。无论是在Python还是R语言中,处理文本数据的能力都是必不可少的。从简单的数据清洗到复杂的文本分析,掌握字符串操作方法可以大大提高我们的工作效率。本文将介绍Python和R中一些常见的字符串操作方法,并展示如何在这两种流行的编程语言中实现它们。
连接字符串
library(tidyverse)
library(stringr)
library(stringi)
str1 <- "Hello, "
str2 <- "World!"
# 合并两个字符串
str_c(str1, str2, sep = "_")
# 将字符向量的所有元素合并为一个字符串
str_c(letters, collapse = ", ")
[1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
str1 = "Hello, "
str2 = "World!"
str1 + str2 # 使用 + 操作符连接
l = ['a','b','c','d']
','.join(l) # 使用 join 方法连接
字符串切片
s <- "Hello, World!Hello, World!"
str_sub(s, start = 8, end = 12)
s = "Hello, World!Hello, World!"
sliced_str = s[7:12]
sliced_str
大小写转换
[1] "HELLO, WORLD!HELLO, WORLD!"
[1] "hello, world!hello, world!"
在R中可以使用janitor包的clean_names()函数将数据框的列名清洗为统一、规范的格式(默认为snake_case)
library(janitor)
x <- data.frame(caseID = 1, DOB = 2, Other = 3)
clean_names(x)
case_id dob other
1 1 2 3
'HELLO, WORLD!HELLO, WORLD!'
'hello, world!hello, world!'
字符串查找
str_locate(s, pattern = "World")
字符串分割
str_split(s, pattern = ",", simplify = T)
[,1] [,2] [,3]
[1,] "Hello" " World!Hello" " World!"
str_split_i(s, pattern = ",", i = 2) # 分割并取出指定位置的元素
s.split(',',-1)# -1代表分割所有
['Hello', ' World!Hello', ' World!']
字符格式化
value <- 0.25
str_glue("value is {value}")
# scales包提供了丰富的格式化方法
str_glue("value:{scales::percent(value)}")
now <- lubridate::now()
dformat <- scales::date_format(format = "%Y-%m-%d %H:%M:%S", tz = "")
str_glue("now is {dformat(now)}")
now is 2025-04-12 19:39:48
用f-string方法
value = 0.25
formatted_percentage = f"value is {value:.0%}"
formatted_percentage
from datetime import datetime
now = datetime.now()
f"now is {now.strftime('%Y-%m-%d %H:%M:%S')}"
'now is 2025-04-12 19:39:48'
去除空白
str_with_spaces <- " Hello, World! "
stri_trim_both(str_with_spaces) # 去除两端空白
stri_trim_left(str_with_spaces) # 去除左侧空白
stri_trim_right(str_with_spaces) # 去除右侧空白
str_with_spaces = ' Hello, World! '
str_with_spaces.strip() # 去除两端空白
str_with_spaces.lstrip() # 去除左侧空白
str_with_spaces.rstrip() # 去除右侧空白
检查子串
str_detect(s, pattern = "World")
字节编码
将字符串编码为字节串及将字节串解码为字符串
[1] "Hello, World!Hello, World!"
stri_encode(s, to = "utf-8")
[1] "Hello, World!Hello, World!"
s_raw <- charToRaw(s)
s_raw
[1] 48 65 6c 6c 6f 2c 20 57 6f 72 6c 64 21 48 65 6c 6c 6f 2c 20 57 6f 72 6c 64
[26] 21
[1] "Hello, World!Hello, World!"
encoded_str = s.encode('utf-8') # 将字符串编码为字节串
encoded_str
b'Hello, World!Hello, World!'
decoded_str = encoded_str.decode('utf-8') # 将字节串解码为字符串
decoded_str
'Hello, World!Hello, World!'
# 在字符串字面量前加b前缀可以直接创建字节串(仅限ASCII字符)
b'Hello, World!Hello, World!'
b'Hello, World!Hello, World!'
加解密
library(digest)
library(openssl)
#' Encrypt a string with AES in CFB mode and return it Base64-encoded
#'
#' @param content Character scalar holding the plaintext to encrypt.
#' @param key Character scalar; its raw bytes are used as the AES key.
#' @param iv Character scalar; its raw bytes are used as the initialisation
#'   vector.
#' @return The Base64 encoding of the encrypted bytes, as produced by
#'   `openssl::base64_encode()`.
str_encrypt <- function(content, key, iv) {
  # Convert the plaintext, key and IV from strings to raw bytes.
  content_bytes <- charToRaw(content)
  key_bytes <- charToRaw(key)
  iv_bytes <- charToRaw(iv)
  # Padding is configured on the cipher object (the same way str_decrypt()
  # builds it); the object's encrypt() method only receives the data.
  # NOTE(review): with padding disabled, digest may require the input length
  # to be a multiple of the 16-byte AES block size -- confirm against the
  # digest::AES documentation.
  aes <- digest::AES(key_bytes, mode = "CFB", IV = iv_bytes, padding = FALSE)
  encrypted_bytes <- aes$encrypt(content_bytes)
  # Base64-encode the encrypted bytes and return the resulting string.
  openssl::base64_encode(encrypted_bytes)
}
#' Decrypt a Base64-encoded AES-CFB ciphertext
#'
#' @param content Base64-encoded ciphertext; it is decoded with
#'   `openssl::base64_decode()` before decryption.
#' @param key Character scalar; its raw bytes are used as the AES key.
#' @param iv Character scalar; its raw bytes are used as the initialisation
#'   vector.
#' @return The value returned by the cipher object's `decrypt()` method.
#'   NOTE(review): presumably the plaintext as a character string -- confirm
#'   against the digest::AES documentation.
str_decrypt <- function(content, key, iv) {
  # Undo the Base64 transport encoding to recover the encrypted bytes.
  encrypted_bytes <- openssl::base64_decode(content)
  # Convert the key and IV from strings to raw bytes.
  key_bytes <- charToRaw(key)
  iv_bytes <- charToRaw(iv)
  # Use the literal FALSE (not the reassignable shorthand `F`) so a
  # user-defined `F` can never change the cipher configuration.
  aes <- digest::AES(key_bytes, mode = "CFB", IV = iv_bytes, padding = FALSE)
  aes$decrypt(encrypted_bytes)
}
# from Crypto.Cipher import AES
# from Crypto.Util.Padding import pad, unpad
# from base64 import b64encode, b64decode
# from Crypto.Random import get_random_bytes
# import re
# def aes_encrypt_string(
# plaintext, key, iv, mode=AES.MODE_CFB, segment_size=128, charset="utf-8"
# ):
# # 将明文转换为字节
# plaintext_bytes = plaintext.encode(charset)
# # 使用pad函数对明文进行填充,确保其长度是AES块大小的整数倍
# padded_plaintext = pad(plaintext_bytes, AES.block_size)
# # 将key和iv转换为字节
# key_bytes = key.encode(charset)
# iv_bytes = iv.encode(charset)
# # 创建一个新的AES加密对象
# cipher = AES.new(key_bytes, mode, iv_bytes, segment_size=segment_size)
# # 加密填充后的明文
# encrypted_bytes = cipher.encrypt(padded_plaintext)
# # 使用base64编码加密后的字节,以便于存储和传输
# encrypted_string = b64encode(encrypted_bytes).decode(charset)
# return encrypted_string
# def aes_decrypt_string(
# content, key, iv, mode=AES.MODE_CFB, segment_size=128, charset="utf-8"
# ):
# encrypted_bytes = b64decode(content)
# key_bytes = key.encode(charset)
# iv_bytes = iv.encode(charset)
# cipher = AES.new(key_bytes, mode, iv_bytes, segment_size=segment_size)
# decrypt = unpad(cipher.decrypt(encrypted_bytes), AES.block_size).decode(charset)
# return decrypt
回到顶部