採用 W3C FAQ 的 regular expression 檢驗方式：
#!/usr/bin/perl
# Demo: classify sample strings with the W3C FAQ byte-level UTF-8 regex.
#
# NOTE: "use utf8" is deliberately NOT enabled. is_utf8() inspects raw bytes;
# under "use utf8" the non-ASCII literal below would be decoded into
# characters above 0xFF, which none of the byte ranges in the pattern match.
use strict;
use warnings;

# Lexical loop variable rather than the package globals $a/$b, which sort()
# uses for its comparison block.
for my $sample ("測試", "yyy") {
    # "ASCII" is only this demo's label for "did not validate as UTF-8";
    # pure-ASCII input is itself valid UTF-8 and is therefore reported as UTF-8.
    my $label = is_utf8($sample) ? "UTF-8" : "ASCII";
    print("[$sample] : $label\n");
}

# Determine whether a byte string is well-formed UTF-8.
#
# Pattern from http://w3.org/International/questions/qa-forms-utf-8.html
#
# Parameters:
#   $p_string - the byte string to check. undef is treated as the empty
#               string (which validates), without an "uninitialized" warning.
# Returns:
#   true if every byte of $p_string belongs to one of the sequences listed
#   in the pattern, false otherwise. Note that the ASCII branch accepts only
#   TAB, LF, CR and the printable range 0x20-0x7E, so other control bytes
#   (e.g. NUL, DEL) make the check fail.
sub is_utf8 {
    my ($p_string) = @_;
    $p_string = q{} unless defined $p_string;

    # Non-capturing group: nothing is extracted, and in list context the
    # match then yields a plain boolean instead of the last matched character.
    return $p_string =~ m/\A(?:
          [\x09\x0A\x0D\x20-\x7E]            # ASCII
        | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
        | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
    )*\z/x;
}
[測試] : UTF-8
[yyy] : UTF-8
<?php
// Demo: classify two sample strings with the W3C FAQ UTF-8 regex.
$a = "測試";
$b = "yyy";

// "ASCII" is only this demo's label for "did not validate as UTF-8";
// pure-ASCII input is itself valid UTF-8 and is therefore reported as UTF-8.
$ca = is_utf8($a) ? "UTF-8" : "ASCII";
$cb = is_utf8($b) ? "UTF-8" : "ASCII";

echo("[$a] : $ca\n");
echo("[$b] : $cb\n");

/**
 * Check whether a byte string is well-formed UTF-8.
 *
 * Pattern from http://w3.org/International/questions/qa-forms-utf-8.html
 *
 * The ASCII branch accepts only TAB, LF, CR and 0x20-0x7E, so other control
 * bytes (e.g. NUL, DEL) make the check fail.
 *
 * @param string $string Byte string to check.
 * @return int|false 1 if the whole string matches, 0 if it does not.
 *                   preg_match() itself may return false on an engine error;
 *                   that value is passed through unchanged.
 */
function is_utf8($string)
{
    // \A ... \z anchor the whole subject, as in the Perl version of this check.
    // The alternatives are mutually exclusive, so the possessive *+ gives the
    // same results as * while telling PCRE it never has to backtrack into the
    // repetition.
    return preg_match('%\A(?:
          [\x09\x0A\x0D\x20-\x7E]            # ASCII
        | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
        | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
    )*+\z%x', $string);
} // function is_utf8
?>
[測試] : UTF-8
[yyy] : UTF-8
[apache@tryboxap04 tmp]$ iconv -f utf8 -t big5 tx.pl >txx.pl
[apache@tryboxap04 tmp]$ perl txx.pl
[測試] : ASCII
[yyy] : UTF-8
[apache@tryboxap04 input]$ file 20080415-2.csv
20080415-2.csv: UTF-8 Unicode text, with CRLF line terminators
[apache@tryboxap04 input]$ file 20080415-2.csv.md5
20080415-2.csv.md5: ASCII text
[apache@tryboxap04 tmp]$ file 20080415-2.csv
20080415-2.csv: ISO-8859 text, with CRLF line terminators