Punycode.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. <?php
  2. namespace TrueBV;
  3. use TrueBV\Exception\DomainOutOfBoundsException;
  4. use TrueBV\Exception\LabelOutOfBoundsException;
  /**
   * Punycode implementation as described in RFC 3492
   *
   * Converts domain names between their Unicode form and their ASCII
   * ("xn--" prefixed) form, one dot-separated label at a time.
   *
   * NOTE(review): charToCodePoint() and codePointToChar() read and write UTF-8
   * byte sequences regardless of the encoding passed to the constructor, so
   * encodings other than UTF-8 look unsupported in practice - confirm before
   * relying on them.
   *
   * @link http://tools.ietf.org/html/rfc3492
   */
  class Punycode
  {
      /**
       * Bootstring parameter values
       *
       */
      const BASE = 36;
      const TMIN = 1;
      const TMAX = 26;
      const SKEW = 38;
      const DAMP = 700;
      const INITIAL_BIAS = 72;
      const INITIAL_N = 128;
      const PREFIX = 'xn--';
      const DELIMITER = '-';

      /**
       * Encode table: digit value (0-35) to its basic code point
       *
       * @var array
       */
      protected static $encodeTable = array(
          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
          'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
          'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
      );

      /**
       * Decode table: basic code point to its digit value (0-35)
       *
       * @var array
       */
      protected static $decodeTable = array(
          'a' => 0, 'b' => 1, 'c' => 2, 'd' => 3, 'e' => 4, 'f' => 5,
          'g' => 6, 'h' => 7, 'i' => 8, 'j' => 9, 'k' => 10, 'l' => 11,
          'm' => 12, 'n' => 13, 'o' => 14, 'p' => 15, 'q' => 16, 'r' => 17,
          's' => 18, 't' => 19, 'u' => 20, 'v' => 21, 'w' => 22, 'x' => 23,
          'y' => 24, 'z' => 25, '0' => 26, '1' => 27, '2' => 28, '3' => 29,
          '4' => 30, '5' => 31, '6' => 32, '7' => 33, '8' => 34, '9' => 35
      );

      /**
       * Character encoding handed to the mb_* string functions
       *
       * @var string
       */
      protected $encoding;

      /**
       * Constructor
       *
       * @param string $encoding Character encoding
       */
      public function __construct($encoding = 'UTF-8')
      {
          $this->encoding = $encoding;
      }

      /**
       * Encode a domain to its Punycode version
       *
       * The input is lower-cased, split on '.', and every label is encoded
       * separately; labels made of basic (ASCII) code points only are returned
       * unchanged by encodePart().
       *
       * @param string $input Domain name in Unicode to be encoded
       * @return string Punycode representation in ASCII
       * @throws LabelOutOfBoundsException When a label is empty or its encoded form exceeds 63 octets
       * @throws DomainOutOfBoundsException When the encoded domain exceeds 255 octets
       */
      public function encode($input)
      {
          $input = mb_strtolower($input, $this->encoding);
          $parts = explode('.', $input);
          // Write back by index rather than by reference so that no dangling
          // reference to the last element survives the loop.
          foreach ($parts as $index => $part) {
              $length = strlen($part);
              if ($length < 1) {
                  throw new LabelOutOfBoundsException(sprintf('The length of any one label is limited to between 1 and 63 octets, but %s given.', $length));
              }
              $parts[$index] = $this->encodePart($part);
          }
          $output = implode('.', $parts);
          $length = strlen($output);
          if ($length > 255) {
              throw new DomainOutOfBoundsException(sprintf('A full domain name is limited to 255 octets (including the separators), %s given.', $length));
          }

          return $output;
      }

      /**
       * Encode a part of a domain name, such as tld, to its Punycode version
       *
       * @param string $input Part of a domain name
       * @return string Punycode representation of a domain part
       * @throws LabelOutOfBoundsException When the prefixed, encoded label exceeds 63 octets
       */
      protected function encodePart($input)
      {
          $codePoints = $this->listCodePoints($input);
          $n = static::INITIAL_N;
          $bias = static::INITIAL_BIAS;
          $delta = 0;
          // $b = number of basic code points, $h = number of code points handled so far.
          $h = $b = count($codePoints['basic']);

          // The basic code points are copied to the output verbatim, in order.
          $output = '';
          foreach ($codePoints['basic'] as $code) {
              $output .= $this->codePointToChar($code);
          }
          if ($input === $output) {
              // Nothing but basic code points: the label needs no encoding.
              return $output;
          }
          if ($b > 0) {
              $output .= static::DELIMITER;
          }

          // Non-basic code points are processed in ascending order, each distinct value once.
          $codePoints['nonBasic'] = array_unique($codePoints['nonBasic']);
          sort($codePoints['nonBasic']);

          $i = 0;
          $length = mb_strlen($input, $this->encoding);
          while ($h < $length) {
              $m = $codePoints['nonBasic'][$i++];
              $delta = $delta + ($m - $n) * ($h + 1);
              $n = $m;

              foreach ($codePoints['all'] as $c) {
                  if ($c < $n || $c < static::INITIAL_N) {
                      $delta++;
                  }
                  if ($c === $n) {
                      // Emit $delta as a generalized variable-length integer.
                      $q = $delta;
                      for ($k = static::BASE;; $k += static::BASE) {
                          $t = $this->calculateThreshold($k, $bias);
                          if ($q < $t) {
                              break;
                          }
                          $code = $t + (($q - $t) % (static::BASE - $t));
                          $output .= static::$encodeTable[$code];
                          // Truncate to an integer: a fractional float here would be
                          // implicitly converted by "%" and by the array lookup below,
                          // which PHP 8.1+ reports as a deprecated lossy conversion.
                          $q = (int) (($q - $t) / (static::BASE - $t));
                      }
                      $output .= static::$encodeTable[$q];
                      $bias = $this->adapt($delta, $h + 1, ($h === $b));
                      $delta = 0;
                      $h++;
                  }
              }

              $delta++;
              $n++;
          }

          $out = static::PREFIX . $output;
          $length = strlen($out);
          if ($length > 63 || $length < 1) {
              throw new LabelOutOfBoundsException(sprintf('The length of any one label is limited to between 1 and 63 octets, but %s given.', $length));
          }

          return $out;
      }

      /**
       * Decode a Punycode domain name to its Unicode counterpart
       *
       * The input is lower-cased and split on '.'; only labels that start with
       * the "xn--" prefix are decoded, all other labels are kept as they are.
       *
       * @param string $input Domain name in Punycode
       * @return string Unicode domain name
       * @throws LabelOutOfBoundsException When a label is empty or longer than 63 octets
       * @throws DomainOutOfBoundsException When the decoded domain exceeds 255 octets
       */
      public function decode($input)
      {
          $input = strtolower($input);
          $parts = explode('.', $input);
          // Write back by index rather than by reference so that no dangling
          // reference to the last element survives the loop.
          foreach ($parts as $index => $part) {
              $length = strlen($part);
              if ($length > 63 || $length < 1) {
                  throw new LabelOutOfBoundsException(sprintf('The length of any one label is limited to between 1 and 63 octets, but %s given.', $length));
              }
              if (strpos($part, static::PREFIX) !== 0) {
                  continue;
              }
              $part = substr($part, strlen(static::PREFIX));
              $parts[$index] = $this->decodePart($part);
          }
          $output = implode('.', $parts);
          $length = strlen($output);
          if ($length > 255) {
              throw new DomainOutOfBoundsException(sprintf('A full domain name is limited to 255 octets (including the separators), %s given.', $length));
          }

          return $output;
      }

      /**
       * Decode a part of domain name, such as tld
       *
       * NOTE(review): characters that are missing from the decode table and
       * input that ends in the middle of a variable-length integer are not
       * rejected here; they surface as PHP notices/warnings only.
       *
       * @param string $input Part of a domain name, without the "xn--" prefix
       * @return string Unicode domain part
       */
      protected function decodePart($input)
      {
          $n = static::INITIAL_N;
          $i = 0;
          $bias = static::INITIAL_BIAS;
          $output = '';

          // Everything before the last delimiter is the literal basic part.
          $pos = strrpos($input, static::DELIMITER);
          if ($pos !== false) {
              $output = substr($input, 0, $pos++);
          } else {
              $pos = 0;
          }

          $outputLength = strlen($output);
          $inputLength = strlen($input);
          while ($pos < $inputLength) {
              // Read one generalized variable-length integer into $i.
              $oldi = $i;
              $w = 1;
              for ($k = static::BASE;; $k += static::BASE) {
                  $digit = static::$decodeTable[$input[$pos++]];
                  $i = $i + ($digit * $w);
                  $t = $this->calculateThreshold($k, $bias);
                  if ($digit < $t) {
                      break;
                  }
                  $w = $w * (static::BASE - $t);
              }

              $bias = $this->adapt($i - $oldi, ++$outputLength, ($oldi === 0));
              // $i wraps around the output length; every full wrap advances $n by one.
              $n = $n + (int) ($i / $outputLength);
              $i = $i % ($outputLength);
              // Insert the decoded code point at character position $i.
              $output = mb_substr($output, 0, $i, $this->encoding) . $this->codePointToChar($n) . mb_substr($output, $i, $outputLength - 1, $this->encoding);
              $i++;
          }

          return $output;
      }

      /**
       * Calculate the bias threshold to fall between TMIN and TMAX
       *
       * Returns $k - $bias clamped to the range [TMIN, TMAX].
       *
       * @param integer $k
       * @param integer $bias
       * @return integer
       */
      protected function calculateThreshold($k, $bias)
      {
          if ($k <= $bias + static::TMIN) {
              return static::TMIN;
          } elseif ($k >= $bias + static::TMAX) {
              return static::TMAX;
          }

          return $k - $bias;
      }

      /**
       * Bias adaptation
       *
       * @param integer $delta Delta that was just encoded or decoded
       * @param integer $numPoints Number of code points handled so far, including the current one
       * @param boolean $firstTime Whether this is the first adaptation for the label
       * @return integer New bias
       */
      protected function adapt($delta, $numPoints, $firstTime)
      {
          // The very first delta is damped by DAMP, later ones are halved.
          $delta = (int) (
              ($firstTime)
                  ? $delta / static::DAMP
                  : $delta / 2
          );
          $delta += (int) ($delta / $numPoints);

          $k = 0;
          while ($delta > ((static::BASE - static::TMIN) * static::TMAX) / 2) {
              $delta = (int) ($delta / (static::BASE - static::TMIN));
              $k = $k + static::BASE;
          }
          $k = $k + (int) (((static::BASE - static::TMIN + 1) * $delta) / ($delta + static::SKEW));

          return $k;
      }

      /**
       * List code points for a given input
       *
       * Code points below 128 are classified as basic, all others as non-basic.
       *
       * @param string $input
       * @return array Multi-dimension array with basic, non-basic and aggregated code points
       */
      protected function listCodePoints($input)
      {
          $codePoints = array(
              'all'      => array(),
              'basic'    => array(),
              'nonBasic' => array(),
          );

          $length = mb_strlen($input, $this->encoding);
          for ($i = 0; $i < $length; $i++) {
              $char = mb_substr($input, $i, 1, $this->encoding);
              $code = $this->charToCodePoint($char);
              if ($code < 128) {
                  $codePoints['all'][] = $codePoints['basic'][] = $code;
              } else {
                  $codePoints['all'][] = $codePoints['nonBasic'][] = $code;
              }
          }

          return $codePoints;
      }

      /**
       * Convert a single or multi-byte character to its code point
       *
       * The byte arithmetic follows the UTF-8 layout: the first byte selects a
       * one, two, three or four byte sequence. Continuation bytes are not validated.
       *
       * @param string $char
       * @return integer
       */
      protected function charToCodePoint($char)
      {
          $code = ord($char[0]);
          if ($code < 128) {
              return $code;
          } elseif ($code < 224) {
              return (($code - 192) * 64) + (ord($char[1]) - 128);
          } elseif ($code < 240) {
              return (($code - 224) * 4096) + ((ord($char[1]) - 128) * 64) + (ord($char[2]) - 128);
          } else {
              return (($code - 240) * 262144) + ((ord($char[1]) - 128) * 4096) + ((ord($char[2]) - 128) * 64) + (ord($char[3]) - 128);
          }
      }

      /**
       * Convert a code point to its single or multi-byte character
       *
       * Produces a one to four byte UTF-8 sequence depending on the size of the code point.
       *
       * @param integer $code
       * @return string
       */
      protected function codePointToChar($code)
      {
          if ($code <= 0x7F) {
              return chr($code);
          } elseif ($code <= 0x7FF) {
              return chr(($code >> 6) + 192) . chr(($code & 63) + 128);
          } elseif ($code <= 0xFFFF) {
              return chr(($code >> 12) + 224) . chr((($code >> 6) & 63) + 128) . chr(($code & 63) + 128);
          } else {
              return chr(($code >> 18) + 240) . chr((($code >> 12) & 63) + 128) . chr((($code >> 6) & 63) + 128) . chr(($code & 63) + 128);
          }
      }
  }