字符串操作

R语言

python

文本处理

常用的字符串操作方法

作者

不止BI

发布于

2024年2月6日

在数据科学的世界里，字符串操作是一项基础而关键的技能。无论是在Python还是R语言中，处理文本数据的能力都是必不可少的。从简单的数据清洗到复杂的文本分析，掌握字符串操作方法可以大大提高我们的工作效率。本文将介绍Python和R中一些常见的字符串操作方法，并展示如何在这两种流行的编程语言中实现它们。

连接字符串

R
Python

library(tidyverse)
library(stringr)
library(stringi)
str1 = "Hello, "
str2 = "World!"
# 合并两个字符串
str_c(str1,str2,sep = '_')

[1] "Hello, _World!"

# 将字符向量中的所有元素合并为一个字符串
str_c(letters, collapse = ", ")

[1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"

str1 = "Hello, "
str2 = "World!"
str1 + str2  # 使用 + 操作符连接

'Hello, World!'

l = ['a','b','c','d']
','.join(l) # 使用 join 方法连接

'a,b,c,d'

字符串切片

R
Python

s = "Hello, World!Hello, World!"
str_sub(s,start = 8,end = 12)

[1] "World"

s = "Hello, World!Hello, World!"
sliced_str = s[7:12]  
sliced_str

'World'

字符串长度

R
Python

str_length(s)

[1] 26

len(s)

大小写转换

R
Python

str_to_upper(s)

[1] "HELLO, WORLD!HELLO, WORLD!"

str_to_lower(s)

[1] "hello, world!hello, world!"

在R中可以使用janitor包的clean_names()函数，将数据框的列名清洗为统一、规范的格式（默认为snake_case）

library(janitor)
x <- data.frame(caseID = 1, DOB = 2, Other = 3)
clean_names(x)

  case_id dob other
1       1   2     3

s.upper()

'HELLO, WORLD!HELLO, WORLD!'

s.lower()

'hello, world!hello, world!'

字符串查找

R
Python

str_locate(s,pattern = 'World')

     start end
[1,]     8  12

s.find('World')

字符串分割

R
Python

str_split(s,pattern = ',',simplify = T)

     [,1]    [,2]           [,3]     
[1,] "Hello" " World!Hello" " World!"

str_split_i(s,pattern = ',',i = 2) #分割并取出指定位置的元素

[1] " World!Hello"

s.split(',',-1)# -1代表分割所有

['Hello', ' World!Hello', ' World!']

字符格式化

R
Python

value = 0.25
str_glue("value is {value}")

value is 0.25

# scales包提供了丰富的格式化方法
str_glue("value:{scales::percent(value)}")

value:25%

now = lubridate::now()
dformat = scales::date_format(format = '%Y-%m-%d %H:%M:%S',tz = "")
str_glue("now is {dformat(now)}")

now is 2024-08-11 14:43:56

用f-string方法

value = 0.25
formatted_percentage = f"value is {value:.0%}"
formatted_percentage

'value is 25%'

from datetime import datetime
now = datetime.now()
f"now is {now.strftime('%Y-%m-%d %H:%M:%S')}"

'now is 2024-08-11 14:43:56'

str_with_spaces = '  Hello, World!  '
stri_trim_both(str_with_spaces) # 去除两端空白

[1] "Hello, World!"

stri_trim_left(str_with_spaces)# 去除左侧空白

[1] "Hello, World!  "

stri_trim_right(str_with_spaces)# 去除右侧空白

[1] "  Hello, World!"

str_with_spaces = '  Hello, World!  '
str_with_spaces.strip()  # 去除两端空白

'Hello, World!'

str_with_spaces.lstrip()  # 去除左侧空白

'Hello, World!  '

str_with_spaces.rstrip()  # 去除右侧空白

'  Hello, World!'

检查子串

R
Python

str_detect(s,pattern = 'World')

[1] TRUE

'World' in s

True

字符串计数

R
Python

str_count(s,"World")

[1] 2

s.count("World")

开头或结尾

R
Python

str_starts(s,'Hello')

[1] TRUE

str_ends(s,"!")

[1] TRUE

s.startswith('Hello')

True

s.endswith('!')

True

字节编码

将字符串编码为字节串及将字节串解码为字符串

R
Python

stri_enc_toutf8(s)

[1] "Hello, World!Hello, World!"

stri_encode(s,to = 'utf-8')

[1] "Hello, World!Hello, World!"

s_raw = charToRaw(s)
s_raw

 [1] 48 65 6c 6c 6f 2c 20 57 6f 72 6c 64 21 48 65 6c 6c 6f 2c 20 57 6f 72 6c 64
[26] 21

rawToChar(s_raw)

[1] "Hello, World!Hello, World!"

encoded_str = s.encode('utf-8')  # 将字符串编码为字节串
encoded_str

b'Hello, World!Hello, World!'

decoded_str = encoded_str.decode('utf-8')  # 将字节串解码为字符串
decoded_str

'Hello, World!Hello, World!'

# 在字符串字面量前加b前缀可以直接创建字节串（字面量中只能包含ASCII字符）
b'Hello, World!Hello, World!'

b'Hello, World!Hello, World!'

加解密

R
Python

library(digest)
library(openssl)

#' Encrypt a string with AES (CFB mode) and return it Base64-encoded
#'
#' @param content Character string to encrypt.
#' @param key Character string used as the AES key; converted to raw bytes
#'   with `charToRaw()`.
#'   NOTE(review): digest::AES presumably expects a 16-, 24- or 32-byte key --
#'   confirm against the digest documentation.
#' @param iv Character string used as the initialisation vector; converted to
#'   raw bytes with `charToRaw()`.
#' @return The ciphertext, Base64-encoded by `openssl::base64_encode()`.
str_encrypt <- function(content, key, iv) {
  # Convert the plaintext, key and IV to raw bytes.
  content_bytes <- charToRaw(content)
  key_bytes <- charToRaw(key)
  iv_bytes <- charToRaw(iv)

  # `padding` is an option of the AES() constructor, not of the `encrypt()`
  # method (which only accepts the text), so it is set here -- the same way
  # str_decrypt() configures its cipher object.
  aes <- digest::AES(key_bytes, mode = "CFB", IV = iv_bytes, padding = FALSE)

  # Encrypt the raw plaintext bytes.
  encrypted_bytes <- aes$encrypt(content_bytes)

  # Base64-encode the ciphertext so it can be stored/transported as text.
  openssl::base64_encode(encrypted_bytes)
}

#' Decrypt a Base64-encoded AES (CFB mode) ciphertext
#'
#' @param content Base64-encoded ciphertext; decoded with
#'   `openssl::base64_decode()`.
#' @param key Character string used as the AES key; converted to raw bytes
#'   with `charToRaw()`.
#' @param iv Character string used as the initialisation vector; converted to
#'   raw bytes with `charToRaw()`.
#' @return Whatever the cipher object's `decrypt()` method returns for the
#'   decoded bytes (presumably the plaintext as a character string -- confirm
#'   against the digest documentation).
str_decrypt <- function(content, key, iv) {
  # Undo the Base64 transport encoding first.
  cipher_raw <- openssl::base64_decode(content)

  # Key and IV are supplied as strings and turned into raw bytes.
  key_raw <- charToRaw(key)
  iv_raw <- charToRaw(iv)

  # Build the cipher object with padding disabled.
  cipher <- digest::AES(key_raw, mode = "CFB", IV = iv_raw, padding = FALSE)

  cipher$decrypt(cipher_raw)
}




from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from base64 import b64encode, b64decode
from Crypto.Random import get_random_bytes
import re


def aes_encrypt_string(plaintext, key, iv, mode=AES.MODE_CFB, segment_size=128, charset='utf-8'):
    """Encrypt ``plaintext`` with AES and return the ciphertext as Base64 text.

    Args:
        plaintext: String to encrypt.
        key: AES key as a string; encoded to bytes with ``charset``.
        iv: Initialisation vector as a string; encoded to bytes with ``charset``.
        mode: Cipher mode constant handed to ``AES.new`` (defaults to
            ``AES.MODE_CFB``).
        segment_size: Forwarded to ``AES.new`` as the ``segment_size`` keyword.
            NOTE(review): this keyword is presumably only accepted for CFB
            mode -- confirm before calling with a different ``mode``.
        charset: Encoding used for ``plaintext``, ``key``, ``iv`` and for
            decoding the Base64 result.

    Returns:
        The Base64-encoded ciphertext as a ``str``.
    """
    # Encode the text and pad it up to a whole number of AES blocks.
    data = pad(plaintext.encode(charset), AES.block_size)

    # Build the cipher from the encoded key and IV.
    cipher = AES.new(
        key.encode(charset),
        mode,
        iv.encode(charset),
        segment_size=segment_size,
    )

    # Encrypt, then Base64-encode so the result is safe to store or transmit.
    return b64encode(cipher.encrypt(data)).decode(charset)


def aes_decrypt_string(content, key, iv, mode=AES.MODE_CFB, segment_size=128, charset='utf-8'):
    """Decrypt a Base64-encoded AES ciphertext back into a string.

    Args:
        content: Base64-encoded ciphertext.
        key: AES key as a string; encoded to bytes with ``charset``.
        iv: Initialisation vector as a string; encoded to bytes with ``charset``.
        mode: Cipher mode constant handed to ``AES.new`` (defaults to
            ``AES.MODE_CFB``).
        segment_size: Forwarded to ``AES.new`` as the ``segment_size`` keyword.
        charset: Encoding used for ``key``, ``iv`` and for decoding the
            recovered plaintext bytes.

    Returns:
        The decrypted text with its block padding removed.
    """
    # Undo the Base64 transport encoding.
    raw = b64decode(content)

    # Build the cipher from the encoded key and IV.
    cipher = AES.new(
        key.encode(charset),
        mode,
        iv.encode(charset),
        segment_size=segment_size,
    )

    # Decrypt, strip the block padding, and decode the bytes back to text.
    padded = cipher.decrypt(raw)
    return unpad(padded, AES.block_size).decode(charset)

回到顶部