字符串操作

R语言
python
文本处理
常用的字符串操作方法
作者

不止BI

发布于

2024年2月6日

在数据科学的世界里,字符串操作是一项基础而关键的技能。无论是在Python还是R语言中,处理文本数据的能力都是必不可少的。从简单的数据清洗到复杂的文本分析,掌握字符串操作方法可以大大提高我们的工作效率。本文将介绍Python和R中一些常见的字符串操作方法,并展示如何在这两种流行的编程语言中实现它们。

连接字符串

library(tidyverse)
library(stringr)
library(stringi)
str1 = "Hello, "
str2 = "World!"
# 合并两个字符串
str_c(str1,str2,sep = '_')
[1] "Hello, _World!"
# 将字符向量中的所有元素合并为一个字符串
str_c(letters, collapse = ", ")
[1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
str1 = "Hello, "
str2 = "World!"
str1 + str2  # 使用 + 操作符连接
'Hello, World!'
l = ['a','b','c','d']
','.join(l) # 使用 join 方法连接
'a,b,c,d'

字符串切片

s = "Hello, World!Hello, World!"
str_sub(s,start = 8,end = 12)
[1] "World"
s = "Hello, World!Hello, World!"
sliced_str = s[7:12]  
sliced_str
'World'

字符串长度

str_length(s) 
[1] 26
len(s) 
26

大小写转换

str_to_upper(s)
[1] "HELLO, WORLD!HELLO, WORLD!"
str_to_lower(s)
[1] "hello, world!hello, world!"

在R中可以使用janitor包的clean_names()函数将数据框的列名清洗为统一、规范的格式(默认为snake_case)

library(janitor)
x <- data.frame(caseID = 1, DOB = 2, Other = 3)
clean_names(x)
  case_id dob other
1       1   2     3
s.upper() 
'HELLO, WORLD!HELLO, WORLD!'
s.lower() 
'hello, world!hello, world!'

字符串查找

str_locate(s,pattern = 'World')
     start end
[1,]     8  12
s.find('World') 
7

字符串分割

str_split(s,pattern = ',',simplify = T)
     [,1]    [,2]           [,3]     
[1,] "Hello" " World!Hello" " World!"
str_split_i(s,pattern = ',',i = 2) #分割并取出指定位置的元素
[1] " World!Hello"
s.split(',',-1)# -1代表分割所有
['Hello', ' World!Hello', ' World!']

字符格式化

value = 0.25
str_glue("value is {value}")
value is 0.25
# scales包提供了丰富的格式化方法
str_glue("value:{scales::percent(value)}")
value:25%
now = lubridate::now()
dformat = scales::date_format(format = '%Y-%m-%d %H:%M:%S',tz = "")
str_glue("now is {dformat(now)}")
now is 2024-08-11 14:43:56

f-string方法

value = 0.25
formatted_percentage = f"value is {value:.0%}"
formatted_percentage
'value is 25%'
from datetime import datetime
now = datetime.now()
f"now is {now.strftime('%Y-%m-%d %H:%M:%S')}"
'now is 2024-08-11 14:43:56'

去除空白

str_with_spaces = '  Hello, World!  '
stri_trim_both(str_with_spaces) # 去除两端空白
[1] "Hello, World!"
stri_trim_left(str_with_spaces)# 去除左侧空白
[1] "Hello, World!  "
stri_trim_right(str_with_spaces)# 去除右侧空白
[1] "  Hello, World!"
str_with_spaces = '  Hello, World!  '
str_with_spaces.strip()  # 去除两端空白
'Hello, World!'
str_with_spaces.lstrip()  # 去除左侧空白
'Hello, World!  '
str_with_spaces.rstrip()  # 去除右侧空白
'  Hello, World!'

检查子串

str_detect(s,pattern = 'World')
[1] TRUE
'World' in s  
True

字符串计数

str_count(s,"World")
[1] 2
s.count("World")
2

开头或结尾

str_starts(s,'Hello')
[1] TRUE
str_ends(s,"!")
[1] TRUE
s.startswith('Hello')
True
s.endswith('!') 
True

字节编码

将字符串编码为字节串及将字节串解码为字符串

stri_enc_toutf8(s)
[1] "Hello, World!Hello, World!"
stri_encode(s,to = 'utf-8')
[1] "Hello, World!Hello, World!"
s_raw = charToRaw(s)
s_raw
 [1] 48 65 6c 6c 6f 2c 20 57 6f 72 6c 64 21 48 65 6c 6c 6f 2c 20 57 6f 72 6c 64
[26] 21
rawToChar(s_raw)
[1] "Hello, World!Hello, World!"
encoded_str = s.encode('utf-8')  # 将字符串编码为字节串
encoded_str
b'Hello, World!Hello, World!'
decoded_str = encoded_str.decode('utf-8')  # 将字节串解码为字符串
decoded_str
'Hello, World!Hello, World!'
# 在字符串字面量前加b可以直接创建字节串(字面量中只能包含ASCII字符)
b'Hello, World!Hello, World!'
b'Hello, World!Hello, World!'

加解密

library(digest)
library(openssl)

#' Encrypt a string with AES in CFB mode and return it Base64-encoded
#'
#' @param content Character string to encrypt; converted to bytes with
#'   `charToRaw()`.
#' @param key Character string used as the AES key; converted to bytes with
#'   `charToRaw()`. NOTE(review): the byte length presumably has to be a valid
#'   AES key size -- confirm against the digest documentation.
#' @param iv Character string used as the initialisation vector; converted to
#'   bytes with `charToRaw()`.
#' @return The ciphertext encoded as Base64 by `openssl::base64_encode()`.
str_encrypt <- function(content, key, iv) {
  # Convert the plaintext, key and IV to raw bytes.
  content_bytes <- charToRaw(content)
  key_bytes <- charToRaw(key)
  iv_bytes <- charToRaw(iv)

  # `padding` is an argument of the digest::AES() constructor; the encrypt()
  # closure it returns only takes the text. Passing `padding` to encrypt()
  # (as the previous version did) fails with an "unused argument" error.
  # The settings here mirror the ones used by str_decrypt().
  aes <- digest::AES(key_bytes, mode = "CFB", IV = iv_bytes, padding = FALSE)

  # Encrypt the raw bytes.
  encrypted_bytes <- aes$encrypt(content_bytes)

  # Base64-encode the ciphertext so it can be stored/transported as text.
  openssl::base64_encode(encrypted_bytes)
}

#' Decrypt a Base64-encoded AES-CFB ciphertext
#'
#' @param content Base64 text to decrypt; decoded with
#'   `openssl::base64_decode()`.
#' @param key Character string used as the AES key; converted to bytes with
#'   `charToRaw()`. Must be the same key that was used for encryption.
#' @param iv Character string used as the initialisation vector; converted to
#'   bytes with `charToRaw()`. Must be the same IV that was used for encryption.
#' @return Whatever `aes$decrypt()` returns for the decoded bytes.
#'   NOTE(review): with digest's default `raw = FALSE` this is presumably a
#'   character string -- confirm against the digest documentation.
str_decrypt <- function(content, key, iv) {
  # Undo the Base64 transport encoding to get the raw ciphertext.
  encrypted_bytes <- openssl::base64_decode(content)

  key_bytes <- charToRaw(key)
  iv_bytes <- charToRaw(iv)

  # A fresh AES object is built for decryption with the same key, IV and
  # padding setting. `FALSE` is spelled out because `F` is an ordinary
  # variable that can be reassigned.
  aes <- digest::AES(key_bytes, mode = "CFB", IV = iv_bytes, padding = FALSE)

  aes$decrypt(encrypted_bytes)
}



from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from base64 import b64encode, b64decode
from Crypto.Random import get_random_bytes
import re


def aes_encrypt_string(plaintext, key, iv, mode=AES.MODE_CFB, segment_size=128, charset='utf-8'):
    """Encrypt ``plaintext`` with AES and return the ciphertext as Base64 text.

    Args:
        plaintext: Text to encrypt; encoded to bytes with ``charset``.
        key: AES key as text; encoded to bytes with ``charset``.
        iv: Initialisation vector as text; encoded to bytes with ``charset``.
        mode: Cipher mode constant handed to ``AES.new`` (CFB by default).
        segment_size: Forwarded to ``AES.new`` as ``segment_size``.
        charset: Encoding used for the plaintext, key, IV and the returned
            Base64 string.

    Returns:
        The Base64-encoded ciphertext, decoded to ``str`` with ``charset``.
    """
    # Encode the text and pad it to a whole number of AES blocks.
    # aes_decrypt_string() strips this padding again with unpad().
    padded = pad(plaintext.encode(charset), AES.block_size)

    # Build the cipher from the encoded key and IV.
    cipher = AES.new(
        key.encode(charset),
        mode,
        iv.encode(charset),
        segment_size=segment_size,
    )

    # Encrypt, then Base64-encode so the result is safe to store or transmit.
    ciphertext = cipher.encrypt(padded)
    return b64encode(ciphertext).decode(charset)


def aes_decrypt_string(content, key, iv, mode=AES.MODE_CFB, segment_size=128, charset='utf-8'):
    """Decrypt Base64-encoded AES ciphertext and return the plaintext.

    Args:
        content: Base64 text (or bytes) holding the ciphertext.
        key: AES key as text; encoded to bytes with ``charset``.
        iv: Initialisation vector as text; encoded to bytes with ``charset``.
        mode: Cipher mode constant handed to ``AES.new`` (CFB by default).
        segment_size: Forwarded to ``AES.new`` as ``segment_size``.
        charset: Encoding used for the key, IV and the decrypted text.

    Returns:
        The decrypted text with the block padding removed.
    """
    # Undo the Base64 transport encoding.
    ciphertext = b64decode(content)

    # Rebuild the cipher from the same key/IV settings used for encryption.
    cipher = AES.new(
        key.encode(charset),
        mode,
        iv.encode(charset),
        segment_size=segment_size,
    )

    # Decrypt, strip the padding added by aes_encrypt_string(), then decode.
    padded_plaintext = cipher.decrypt(ciphertext)
    plaintext_bytes = unpad(padded_plaintext, AES.block_size)
    return plaintext_bytes.decode(charset)
回到顶部