返回

正则表达式教程:匹配中文/汉字

正则表达式

一、正则解释

给定的正则表达式:

/^(?:[\u3400-\u4DB5\u4E00-\u9FEA\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]|[\uD840-\uD868\uD86A-\uD86C\uD86F-\uD872\uD874-\uD879][\uDC00-\uDFFF]|\uD869[\uDC00-\uDED6\uDF00-\uDFFF]|\uD86D[\uDC00-\uDF34\uDF40-\uDFFF]|\uD86E[\uDC00-\uDC1D\uDC20-\uDFFF]|\uD873[\uDC00-\uDEA1\uDEB0-\uDFFF]|\uD87A[\uDC00-\uDFE0])$/

含义:

  • ^:匹配字符串的开头。
  • (?: ... ):非捕获组,不捕获匹配内容。
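  • [\u3400-\u4DB5\u4E00-\u9FEA\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]:匹配基本多文种平面(BMP)内的单个汉字,包括扩展 A 区(U+3400–U+4DB5)、基本区(U+4E00–U+9FEA),以及兼容汉字区块中实际属于统一汉字的 12 个字符。
  • [\uD840-\uD868\uD86A-\uD86C\uD86F-\uD872\uD874-\uD879][\uDC00-\uDFFF]:匹配以代理对(一个高位代理加一个低位代理,共两个 UTF-16 码元)表示的增补平面汉字,覆盖扩展 B–F 区中低位代理可取全部取值的部分。
  • \uD869[\uDC00-\uDED6\uDF00-\uDFFF]:匹配扩展 B 区的末段(U+2A400–U+2A6D6)和扩展 C 区的起始部分(U+2A700–U+2A7FF),并跳过两区之间的空隙。
  • \uD86D[\uDC00-\uDF34\uDF40-\uDFFF]:匹配扩展 C 区的末段(U+2B400–U+2B734)和扩展 D 区的起始部分(U+2B740–U+2B7FF)。
  • \uD86E[\uDC00-\uDC1D\uDC20-\uDFFF]:匹配扩展 D 区的末段(U+2B800–U+2B81D)和扩展 E 区的起始部分(U+2B820–U+2BBFF)。
  • \uD873[\uDC00-\uDEA1\uDEB0-\uDFFF]:匹配扩展 E 区的末段(U+2CC00–U+2CEA1)和扩展 F 区的起始部分(U+2CEB0–U+2CFFF)。
  • \uD87A[\uDC00-\uDFE0]:匹配扩展 F 区的末段(U+2E800–U+2EBE0)。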
  • $:匹配字符串的结尾。

二、使用场景

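注意:上面给出的表达式在非捕获组之后没有量词,因此只能匹配恰好由一个汉字组成的字符串;若要匹配由多个汉字组成的文本(如姓名、地址),需要在分组后加上量词 +,即 ^(?: ... )+$。若要从一段文本中提取或过滤中文,还需要去掉 ^ 和 $ 这两个锚点。

此正则表达式主要用于匹配中文或汉字,常见场景有: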

  • 验证输入的姓名、地址或其他需要包含中文信息的字段。
  • 从文本中提取中文部分。
  • 过滤掉非中文内容。

三、代码示例

JavaScript

// Matches a string that consists entirely of Chinese ideographs (one or more).
// BMP ideographs are matched directly; supplementary-plane ideographs (CJK
// Extension B-F) are matched as UTF-16 surrogate pairs. That works here because
// the pattern has no `u` flag, so the engine operates on UTF-16 code units.
// The `+` after the group is required: without it only a single-character
// string can match, and the two-character sample below would be rejected.
const regex = /^(?:[\u3400-\u4DB5\u4E00-\u9FEA\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]|[\uD840-\uD868\uD86A-\uD86C\uD86F-\uD872\uD874-\uD879][\uDC00-\uDFFF]|\uD869[\uDC00-\uDED6\uDF00-\uDFFF]|\uD86D[\uDC00-\uDF34\uDF40-\uDFFF]|\uD86E[\uDC00-\uDC1D\uDC20-\uDFFF]|\uD873[\uDC00-\uDEA1\uDEB0-\uDFFF]|\uD87A[\uDC00-\uDFE0])+$/;

const input = "正则";
const result = regex.test(input);

// Report whether the whole input is made of Chinese ideographs.
if (result) {
  console.log("匹配成功");
} else {
  console.log("匹配失败");
}

Java

import java.util.regex.Pattern;

/**
 * Checks whether a string consists solely of Chinese ideographs.
 *
 * <p>Covered ranges: CJK Extension A (U+3400-U+4DB5), the basic CJK block
 * (U+4E00-U+9FEA), the twelve unified ideographs that live in the compatibility
 * block (U+FA0E ... U+FA29), and Extensions B-F in the supplementary planes
 * (U+20000-U+2EBE0, minus the unassigned gaps between the extensions).
 */
public class ChineseMatcher {

  /**
   * Compiled once and reused; {@link Pattern} instances are immutable and thread-safe.
   *
   * <p>The supplementary ranges are written as code points rather than as surrogate
   * code units: {@code java.util.regex} matches input by code point, so a character
   * class made of surrogate halves does not match a real supplementary character.
   * The trailing {@code +} lets the pattern accept more than one ideograph.
   */
  private static final Pattern CHINESE_PATTERN = Pattern.compile(
      "[\\u3400-\\u4DB5\\u4E00-\\u9FEA"
          + "\\uFA0E\\uFA0F\\uFA11\\uFA13\\uFA14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29"
          + "\\x{20000}-\\x{2A6D6}\\x{2A700}-\\x{2B734}\\x{2B740}-\\x{2B81D}"
          + "\\x{2B820}-\\x{2CEA1}\\x{2CEB0}-\\x{2EBE0}]+");

  /**
   * Tells whether {@code input} is a non-empty string made up only of Chinese ideographs.
   *
   * @param input the text to check; may be {@code null}
   * @return {@code true} if every character of {@code input} is a Chinese ideograph;
   *     {@code false} for {@code null}, the empty string, or any other content
   */
  public static boolean isChinese(String input) {
    if (input == null) {
      return false;
    }
    // matches() requires the whole input to match, so no explicit anchors are needed.
    return CHINESE_PATTERN.matcher(input).matches();
  }

  /**
   * Runs the check on a sample string and prints the outcome.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    String input = "正则";
    boolean result = isChinese(input);

    System.out.println(result ? "匹配成功" : "匹配失败");
  }
}

PHP

<?php

// Matches a string that consists entirely of Chinese ideographs (one or more).
// PCRE does not support the JavaScript-style \uXXXX escape, so every code point is
// written as \x{...} and the pattern is compiled with the `u` modifier, which makes
// both the pattern and the subject be treated as UTF-8. With `u` the engine works
// on code points, so the supplementary-plane ranges (CJK Extension B-F) are given
// directly instead of as surrogate pairs.
// The `D` modifier stops `$` from also matching before a trailing newline.
$pattern = '/^[\x{3400}-\x{4DB5}\x{4E00}-\x{9FEA}\x{FA0E}\x{FA0F}\x{FA11}\x{FA13}\x{FA14}\x{FA1F}\x{FA21}\x{FA23}\x{FA24}\x{FA27}-\x{FA29}\x{20000}-\x{2A6D6}\x{2A700}-\x{2B734}\x{2B740}-\x{2B81D}\x{2B820}-\x{2CEA1}\x{2CEB0}-\x{2EBE0}]+$/uD';

$input = "正则";
$result = preg_match($pattern, $input);

// preg_match() returns 1 on a match, 0 on no match and false on error
// (for example when the subject is not valid UTF-8); only 1 counts as success.
if ($result === 1) {
  echo "匹配成功";
} else {
  echo "匹配失败";
}

Python

import re

pattern = r'^(?:[\u3400-\u4DB5\u4E00-\u9FEA\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]|[\uD840-\uD868\uD86A-\uD86C\uD86F-\uD872\uD874-\uD879][\uDC00-\uDFFF]|\uD869[\uDC00-\uDED6\uDF00-\uDFFF]|\uD86D[\uDC00-\uDF34\uDF40-\uDFFF]|\uD86E[\uDC00-\uDC1D\uDC20-\uDFFF]|\uD873[\uDC00-\uDEA1\uDEB0-\uDFFF]|\uD87A[\uDC00-\uDFE0])
import re

pattern = r'^(?:[\u3400-\u4DB5\u4E00-\u9FEA\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29]|[\uD840-\uD868\uD86A-\uD86C\uD86F-\uD872\uD874-\uD879][\uDC00-\uDFFF]|\uD869[\uDC00-\uDED6\uDF00-\uDFFF]|\uD86D[\uDC00-\uDF34\uDF40-\uDFFF]|\uD86E[\uDC00-\uDC1D\uDC20-\uDFFF]|\uD873[\uDC00-\uDEA1\uDEB0-\uDFFF]|\uD87A[\uDC00-\uDFE0])$'

input = "正则"
result = re.match(pattern, input)

if result:
#x27; input = "正则" result = re.match(pattern, input) if result: