Blog

Convert HTML named entities to numeric entities

This code uses get_html_translation_table to build a lookup map from named HTML entities to their numeric equivalents, then uses str_replace to convert those entities in a string.

The first two functions are from http://stackoverflow.com/a/12848889

/**
 * Encode a Unicode code point as a UTF-8 byte string.
 *
 * Values below 0x80 (negatives included) are masked to their low 7 bits and
 * emitted as a single byte.
 *
 * @param int $ord Code point to encode.
 * @return string|false The 1-4 byte UTF-8 sequence, or FALSE when $ord is
 *                      0x110000 or greater (outside the Unicode range).
 */
function utf8_chr ($ord) {
  switch (TRUE) {
    case $ord < 0x80:
      // 1 byte: 0xxxxxxx
      return pack('C*', $ord & 0x7F);
    case $ord < 0x0800:
      // 2 bytes: 110xxxxx 10xxxxxx
      return pack('C*', (($ord & 0x07C0) >> 6) | 0xC0, ($ord & 0x3F) | 0x80);
    case $ord < 0x010000:
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      return pack('C*', (($ord & 0xF000) >> 12) | 0xE0, (($ord & 0x0FC0) >> 6) | 0x80, ($ord & 0x3F) | 0x80);
    case $ord < 0x110000:
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return pack('C*', (($ord & 0x1C0000) >> 18) | 0xF0, (($ord & 0x03F000) >> 12) | 0x80, (($ord & 0x0FC0) >> 6) | 0x80, ($ord & 0x3F) | 0x80);
  }
  return FALSE;
}

/**
 * Decode a single UTF-8 encoded character into its Unicode code point.
 *
 * The whole of $chr must be exactly one character: the byte count selects the
 * expected sequence length, and the lead/continuation bit patterns are checked
 * before the payload bits are combined.
 *
 * @param string $chr One UTF-8 encoded character (1-4 bytes).
 * @return int|false The code point, or FALSE when $chr is empty, longer than
 *                   4 bytes, or its bytes do not match the UTF-8 bit layout
 *                   for that length.
 */
function utf8_ord ($chr) {
  // unpack() returns a 1-based array; reindex from 0 for the checks below.
  $bytes = array_values(unpack('C*', $chr));
  switch (count($bytes)) {
    case 1:
      // 0xxxxxxx
      if ($bytes[0] < 0x80) {
        return $bytes[0];
      }
      break;
    case 2:
      // 110xxxxx 10xxxxxx
      if (($bytes[0] & 0xE0) === 0xC0 && ($bytes[1] & 0xC0) === 0x80) {
        return (($bytes[0] & 0x1F) << 6) | ($bytes[1] & 0x3F);
      }
      break;
    case 3:
      // 1110xxxx 10xxxxxx 10xxxxxx
      if (($bytes[0] & 0xF0) === 0xE0 && ($bytes[1] & 0xC0) === 0x80 && ($bytes[2] & 0xC0) === 0x80) {
        return (($bytes[0] & 0x0F) << 12) | (($bytes[1] & 0x3F) << 6) | ($bytes[2] & 0x3F);
      }
      break;
    case 4:
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if (($bytes[0] & 0xF8) === 0xF0 && ($bytes[1] & 0xC0) === 0x80 && ($bytes[2] & 0xC0) === 0x80 && ($bytes[3] & 0xC0) === 0x80) {
        return (($bytes[0] & 0x07) << 18) | (($bytes[1] & 0x3F) << 12) | (($bytes[2] & 0x3F) << 6) | ($bytes[3] & 0x3F);
      }
      break;
  }
  return FALSE;
}
// Demo: rewrite every named HTML 4.01 entity in $input as a numeric entity.
// The sample has to contain named entities, otherwise there is nothing to replace.
$input = '&lt;title&gt;My Web Page&lt;/title&gt; this &amp; that&trade;';
// get_html_translation_table() maps character => '&name;'. Flipping it gives
// '&name;' => character, and array_map() (which preserves the string keys)
// turns each character into '&#codepoint;'. UTF-8 is requested explicitly
// because utf8_ord() decodes UTF-8 byte sequences.
$map = array_map(function($a) { return '&#' . utf8_ord($a) . ';'; }, array_flip(get_html_translation_table(HTML_ENTITIES, ENT_HTML401, 'UTF-8')));
$output = str_replace(array_keys($map), array_values($map), $input);
// Prints: &#60;title&#62;My Web Page&#60;/title&#62; this &#38; that&#8482;
echo "$output\n";

No Comments

Leave a Reply

Your email address will not be published. Required fields are marked *

Comment replies are not available offline