在数据科学的世界里,字符串操作是一项基础而关键的技能。无论是在Python还是R语言中,处理文本数据的能力都是必不可少的。从简单的数据清洗到复杂的文本分析,掌握字符串操作方法可以大大提高我们的工作效率。本文将介绍Python和R中一些常见的字符串操作方法,并展示如何在这两种流行的编程语言中实现它们。
连接字符串
library(tidyverse)
library(stringr)
library(stringi)
# Two pieces of text to concatenate
str1 <- "Hello, "
str2 <- "World!"
# Join the two strings, placing "_" between them
str_c(str1, str2, sep = "_")
# Collapse a character vector into a single comma-separated string
str_c(letters, collapse = ", ")
[1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
str1 = "Hello, "
str2 = "World!"
str1 + str2  # concatenate with the + operator
l = list("abcd")
",".join(l)  # concatenate the list items with str.join
字符串切片
# Sample text for the substring example
s <- "Hello, World!Hello, World!"
# Extract characters 8 through 12 (positions are 1-based and inclusive)
str_sub(s, start = 8, end = 12)
s = "Hello, World!" * 2
# Take indices 7 up to (but not including) 12
sliced_str = s[slice(7, 12)]
sliced_str
大小写转换
str_to_upper(s)
[1] "HELLO, WORLD!HELLO, WORLD!"
str_to_lower(s)
[1] "hello, world!hello, world!"
在 R 中可以使用 janitor 包将数据框的列名清洗为美观、规范的格式
library(janitor)
# Data frame whose column names mix camelCase, upper case and title case
x <- data.frame(caseID = 1, DOB = 2, Other = 3)
# Normalise the column names
janitor::clean_names(x)
case_id dob other
1 1 2 3
s.upper()
'HELLO, WORLD!HELLO, WORLD!'
s.lower()
'hello, world!hello, world!'
字符串查找
# Locate the pattern "World" inside s
str_locate(s, pattern = "World")
字符串分割
# Split s on commas; simplify = TRUE returns a character matrix.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
str_split(s, pattern = ",", simplify = TRUE)
[,1] [,2] [,3]
[1,] "Hello" " World!Hello" " World!"
# Split on commas and keep only the piece at position i
str_split_i(s, pattern = ",", i = 2)
# maxsplit=-1 places no limit on the number of splits
s.split(",", -1)
['Hello', ' World!Hello', ' World!']
字符格式化
value <- 0.25
str_glue("value is {value}")
# The scales package provides a rich set of formatting helpers
str_glue("value:{scales::percent(value)}")
now <- lubridate::now()
dformat <- scales::date_format(format = "%Y-%m-%d %H:%M:%S", tz = "")
str_glue("now is {dformat(now)}")
now is 2024-08-11 14:43:56
在 Python 中使用 f-string 方法
from datetime import datetime

# Format a fraction as a whole-number percentage
value = 0.25
formatted_percentage = f"value is {value:.0%}"
formatted_percentage

# Format the current timestamp
now = datetime.now()
f"now is {now.strftime('%Y-%m-%d %H:%M:%S')}"
'now is 2024-08-11 14:43:56'
去除空白
str_with_spaces <- " Hello, World! "
# Strip whitespace from both ends
stri_trim_both(str_with_spaces)
# Strip whitespace from the left side
stri_trim_left(str_with_spaces)
# Strip whitespace from the right side
stri_trim_right(str_with_spaces)
str_with_spaces = " Hello, World! "
# Strip whitespace from both ends
str_with_spaces.strip()
# Strip whitespace from the left side
str_with_spaces.lstrip()
# Strip whitespace from the right side
str_with_spaces.rstrip()
检查子串
# Test whether s contains the pattern "World"
str_detect(s, pattern = "World")
字节编码
将字符串编码为字节串及将字节串解码为字符串
[1] "Hello, World!Hello, World!"
# Convert s to the UTF-8 encoding
stri_encode(s, to = "utf-8")
[1] "Hello, World!Hello, World!"
# Convert the string into a raw vector of its bytes
s_raw <- charToRaw(s)
s_raw
[1] 48 65 6c 6c 6f 2c 20 57 6f 72 6c 64 21 48 65 6c 6c 6f 2c 20 57 6f 72 6c 64
[26] 21
rawToChar(s_raw)
[1] "Hello, World!Hello, World!"
# Encode the string into a bytes object
encoded_str = s.encode("utf-8")
encoded_str
# Decode the bytes object back into a string
decoded_str = encoded_str.decode("utf-8")
decoded_str
'Hello, World!Hello, World!'
# Prefixing a string literal with b creates a bytes object directly
b'Hello, World!Hello, World!'
b'Hello, World!Hello, World!'
加解密
library(digest)
library(openssl)
#' Encrypt a string with AES (CFB mode) and Base64-encode the ciphertext
#'
#' @param content Character string to encrypt.
#' @param key Character string used as the AES key; converted to bytes with
#'   `charToRaw()`. NOTE(review): AES keys are 16, 24 or 32 bytes long --
#'   confirm how `digest::AES()` reports other lengths.
#' @param iv Character string used as the initialisation vector; converted to
#'   bytes with `charToRaw()`.
#' @return The value of `openssl::base64_encode()` applied to the ciphertext.
str_encrypt <- function(content, key, iv) {
  # Convert the plaintext, the key and the IV to raw bytes
  content_bytes <- charToRaw(content)
  key_bytes <- charToRaw(key)
  iv_bytes <- charToRaw(iv)
  # `padding` is an argument of the AES() constructor, not of $encrypt();
  # set it here, the same way str_decrypt() configures its cipher object
  aes <- digest::AES(key_bytes, mode = "CFB", IV = iv_bytes, padding = FALSE)
  # Encrypt the raw bytes
  encrypted_bytes <- aes$encrypt(content_bytes)
  # Base64-encode the ciphertext so it can be handled as text
  openssl::base64_encode(encrypted_bytes)
}
#' Decrypt a Base64-encoded AES (CFB mode) ciphertext
#'
#' @param content Base64-encoded ciphertext.
#' @param key Character string used as the AES key; converted to bytes with
#'   `charToRaw()`.
#' @param iv Character string used as the initialisation vector; converted to
#'   bytes with `charToRaw()`.
#' @return Whatever `aes$decrypt()` returns with its default arguments.
#'   NOTE(review): presumably the plaintext as a character string -- confirm
#'   against the digest::AES() documentation.
str_decrypt <- function(content, key, iv) {
  # Undo the Base64 encoding to recover the raw ciphertext
  encrypted_bytes <- openssl::base64_decode(content)
  key_bytes <- charToRaw(key)
  iv_bytes <- charToRaw(iv)
  # Pass the IV by name and spell out FALSE (F is a reassignable variable)
  aes <- digest::AES(key_bytes, mode = "CFB", IV = iv_bytes, padding = FALSE)
  aes$decrypt(encrypted_bytes)
}
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from base64 import b64encode, b64decode
from Crypto.Random import get_random_bytes
import re
def aes_encrypt_string(plaintext, key, iv, mode=AES.MODE_CFB, segment_size=128, charset='utf-8'):
    """Encrypt ``plaintext`` with AES and return the ciphertext as Base64 text.

    Args:
        plaintext: Text to encrypt.
        key: AES key as text; encoded to bytes with ``charset``.
        iv: Initialisation vector as text; encoded to bytes with ``charset``.
        mode: Cipher mode constant handed to ``AES.new`` (CFB by default).
        segment_size: Forwarded to ``AES.new`` as ``segment_size``.
        charset: Encoding applied to ``plaintext``, ``key`` and ``iv``.

    Returns:
        The Base64-encoded ciphertext as a ``str``.
    """
    # Convert the plaintext to bytes
    plaintext_bytes = plaintext.encode(charset)
    # Pad to a whole number of AES blocks; aes_decrypt_string unpads it again
    padded_plaintext = pad(plaintext_bytes, AES.block_size)
    # Convert the key and the IV to bytes
    key_bytes = key.encode(charset)
    iv_bytes = iv.encode(charset)
    # Build the cipher object and encrypt the padded plaintext
    cipher = AES.new(key_bytes, mode, iv_bytes, segment_size=segment_size)
    encrypted_bytes = cipher.encrypt(padded_plaintext)
    # Base64 output is always ASCII, so decode it as ASCII instead of with
    # `charset`, which would garble the text for encodings such as UTF-16
    return b64encode(encrypted_bytes).decode('ascii')
def aes_decrypt_string(content, key, iv, mode=AES.MODE_CFB, segment_size=128, charset='utf-8'):
    """Decrypt Base64-encoded AES ciphertext and return the plaintext.

    Args:
        content: Base64-encoded ciphertext.
        key: AES key as text; encoded to bytes with ``charset``.
        iv: Initialisation vector as text; encoded to bytes with ``charset``.
        mode: Cipher mode constant handed to ``AES.new`` (CFB by default).
        segment_size: Forwarded to ``AES.new`` as ``segment_size``.
        charset: Encoding for ``key``/``iv`` and for decoding the plaintext.

    Returns:
        The unpadded plaintext decoded with ``charset``.
    """
    # Undo the Base64 encoding first, then prepare the key material
    ciphertext = b64decode(content)
    raw_key = key.encode(charset)
    raw_iv = iv.encode(charset)
    decryptor = AES.new(raw_key, mode, raw_iv, segment_size=segment_size)
    # Decrypt, strip the block padding, then decode back to text
    padded_plaintext = decryptor.decrypt(ciphertext)
    plaintext_bytes = unpad(padded_plaintext, AES.block_size)
    return plaintext_bytes.decode(charset)
回到顶部