Every PHP developer who works in Japan has realized that the mb_* functions don't work that well with SJIS-encoded strings. Some characters are treated as 1 byte long, some as 2 bytes and some as 3 bytes |-| ; not really useful in the end. I recently discovered this article (in Japanese: http://phpspot.org/blog/archives/2005/11/php_17.html). The regex it presents is supposed to split a Japanese string into words. Linguistically, I completely disagree with the results, but it was an interesting starting point for Japanese string manipulation, especially for SJIS-encoded strings :> .
Then I changed the regex a little in order to obtain a one-character tokenizer. This regex then enabled me to reimplement the basic but really useful string manipulation functions such as substr, str_replace, strpos, strlen, etc.
Here is the code of the class:
/**
 * Basic string manipulation helpers for Japanese text (SJIS by default),
 * modelled on the single-byte substr(), strlen(), strpos() and str_replace().
 *
 * Every method works on the list of characters produced by charSpliter(), so
 * offsets and lengths are counted in characters, not bytes. Characters that
 * charSpliter() does not recognise (spaces, punctuation, ...) are skipped by
 * the tokenizer and therefore are neither counted nor copied into results.
 */
class processJ
{
    /** @var string Encoding passed to mb_regex_encoding() before each split. */
    private $encoding;

    /**
     * @param string $encoding Encoding of the strings to process. It must also
     *                         be the encoding this source file is saved in,
     *                         because the tokenizer pattern is a literal.
     */
    public function __construct($encoding = "SJIS")
    {
        $this->encoding = $encoding;
        mb_regex_encoding($this->encoding);
    }

    public function __destruct()
    {
    }

    /**
     * Split a string into an array of single characters.
     *
     * Only kanji, hiragana, katakana and ASCII alphanumerics are returned;
     * anything else is skipped.
     *
     * @param string $str String in the encoding given to the constructor.
     * @return array List of one-character strings, in order of appearance.
     */
    public function charSpliter($str)
    {
        $token = array();
        $str = (string) $str;
        if ($str === '') {
            return $token;
        }

        // The regex encoding is process-wide state; re-apply ours in case
        // other code changed it since the constructor ran.
        mb_regex_encoding($this->encoding);

        // NOTE(review): the last two alternatives are identical; presumably one
        // of them was meant to cover full-width alphanumerics - confirm.
        // NOTE(review): ranges are evaluated in the regex encoding's code
        // order, so verify that [一-龠] covers every kanji needed under SJIS.
        $pattern = "[一-龠]|[ぁ-ん]|[ァ-ヴー]|[a-zA-Z0-9]|[a-zA-Z0-9]";

        // Let the regex engine walk the string itself. The previous version
        // used mb_ereg()'s return value as a byte length (it is `true` on
        // PHP 8) and located the match with byte-wise strpos(), which can hit
        // the trail byte of an earlier, skipped double-byte character.
        if (!mb_ereg_search_init($str, $pattern)) {
            return $token;
        }
        while (($regs = mb_ereg_search_regs()) !== false) {
            $token[] = $regs[0];
        }

        return $token;
    }

    /**
     * Character-based counterpart of substr().
     *
     * @param string   $str
     * @param int      $start  Character offset; negative counts from the end.
     * @param int|null $lenght Number of characters to return (not an end
     *                         index); NULL means "up to the end". A negative
     *                         value stops that many characters before the end.
     * @return string The selected characters, or "" when the range is empty.
     */
    public function substrJ($str, $start, $lenght = NULL)
    {
        $strToken = $this->charSpliter($str);
        $lenght = ($lenght === NULL) ? NULL : (int) $lenght;

        // array_slice() has the same offset/length rules as substr() and
        // clamps out-of-range values instead of reading undefined offsets.
        return implode('', array_slice($strToken, (int) $start, $lenght));
    }

    /**
     * Character-based counterpart of strlen().
     *
     * @param string $str
     * @return int Number of characters recognised by charSpliter().
     */
    public function strlenJ($str)
    {
        return count($this->charSpliter($str));
    }

    /**
     * Character-based counterpart of strpos().
     *
     * @param string $haystack
     * @param string $needle
     * @return int|false Character offset of the first occurrence, or false
     *                   when $needle is not found or has no recognised
     *                   characters.
     */
    public function strposJ($haystack, $needle)
    {
        $strToken = $this->charSpliter($haystack);
        $needleToken = $this->charSpliter($needle);
        $needleLen = count($needleToken);
        if ($needleLen === 0) {
            return false;
        }

        // Stop where the needle no longer fits into the remaining haystack.
        $last = count($strToken) - $needleLen;
        for ($i = 0; $i <= $last; $i++) {
            if ($this->tokensMatchAt($strToken, $needleToken, $i)) {
                return $i;
            }
        }

        return false;
    }

    /**
     * Character-based counterpart of str_replace() for string arguments.
     *
     * Replaces every non-overlapping occurrence of $search, scanning left to
     * right. When nothing is replaced, $subject is returned unchanged;
     * otherwise the result is rebuilt from charSpliter() tokens.
     *
     * @param string $search
     * @param string $replace
     * @param string $subject
     * @return string
     */
    public function str_replaceJ($search, $replace, $subject)
    {
        $subjectToken = $this->charSpliter($subject);
        $searchToken = $this->charSpliter($search);
        $subjectLen = count($subjectToken);
        $searchLen = count($searchToken);
        if ($searchLen === 0) {
            return $subject;
        }

        $result = '';
        $replaced = false;
        $i = 0;
        while ($i < $subjectLen) {
            if ($this->tokensMatchAt($subjectToken, $searchToken, $i)) {
                $result .= $replace;
                $i += $searchLen;
                $replaced = true;
            } else {
                $result .= $subjectToken[$i];
                $i++;
            }
        }

        return $replaced ? $result : $subject;
    }

    /**
     * Tell whether every token of $needle appears in $haystack from $offset on.
     */
    private function tokensMatchAt(array $haystack, array $needle, $offset)
    {
        foreach ($needle as $j => $char) {
            if (!isset($haystack[$offset + $j]) || $haystack[$offset + $j] !== $char) {
                return false;
            }
        }

        return true;
    }
}
// Usage example. The value after ">" on each line is the expected output.
$test = new processJ();
print($test->substrJ("ようこそcyberblogへ漢字",0,4) . "\n"); // > "ようこそ"
print($test->strlenJ("ようこそcyberblogへ漢字") . "\n"); // > 16
print($test->strposJ("ようこそcyberblogへ漢字", "cyber") . "\n"); // > 4
print($test->str_replaceJ("cyberblog","サイバーブログ","ようこそcyberblogへ漢字") . "\n"); // > "ようこそサイバーブログへ漢字"
?>
Have fun!
