3 namespace Drupal\Tests\Component\Transliteration;
5 use Drupal\Component\Transliteration\PhpTransliteration;
6 use Drupal\Component\Utility\Random;
7 use Drupal\Tests\UnitTestCase;
8 use org\bovigo\vfs\vfsStream;
11 * Tests Transliteration component functionality.
13 * @group Transliteration
15 * @coversDefaultClass \Drupal\Component\Transliteration\PhpTransliteration
17 class PhpTransliterationTest extends UnitTestCase {
/**
 * Tests the PhpTransliteration::removeDiacritics() function.
 *
 * @param string $original
 *   The original string, which may contain letters carrying diacritics.
 * @param string $expected
 *   The expected return from PhpTransliteration::removeDiacritics().
 *
 * @dataProvider providerTestPhpTransliterationRemoveDiacritics
 */
public function testRemoveDiacritics($original, $expected) {
  $transliterator_class = new PhpTransliteration();
  $result = $transliterator_class->removeDiacritics($original);
  // Compare strictly, as the other tests in this class do, so a result of
  // the wrong type cannot pass through loose equality.
  $this->assertSame($expected, $result);
}
/**
 * Provides data for self::testRemoveDiacritics().
 *
 * Each row pairs an input string with the string expected back once the
 * diacritics are removed. Characters whose expected value equals the input
 * (for example Æ, Ð, ×, Þ and ß) are expected to be returned unchanged.
 *
 * @return array
 *   An array of arrays, each containing the parameters for
 *   self::testRemoveDiacritics().
 */
public function providerTestPhpTransliterationRemoveDiacritics() {
  return [
    // Test all characters in the Unicode range 0x00bf to 0x017f.
    ['ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ', 'AAAAAAÆCEEEEIIII'],
    ['ÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß', 'ÐNOOOOO×OUUUUYÞß'],
    ['àáâãäåæçèéêëìíîï', 'aaaaaaæceeeeiiii'],
    ['ðñòóôõö÷øùúûüýþÿ', 'ðnooooo÷ouuuuyþy'],
    ['ĀāĂăĄąĆćĈĉĊċČčĎď', 'AaAaAaCcCcCcCcDd'],
    ['ĐđĒēĔĕĖėĘęĚěĜĝĞğ', 'DdEeEeEeEeEeGgGg'],
    ['ĠġĢģĤĥĦħĨĩĪīĬĭĮį', 'GgGgHhHhIiIiIiIi'],
    ['İıIJijĴĵĶķĸĹĺĻļĽľĿ', 'IiIJijJjKkĸLlLlLlL'],
    ['ŀŁłŃńŅņŇňʼnŊŋŌōŎŏ', 'lLlNnNnNnʼnŊŋOoOo'],
    ['ŐőŒœŔŕŖŗŘřŚśŜŝŞş', 'OoŒœRrRrRrSsSsSs'],
    ['ŠšŢţŤťŦŧŨũŪūŬŭŮů', 'SsTtTtTtUuUuUuUu'],
    ['ŰűŲųŴŵŶŷŸŹźŻżŽž', 'UuUuWwYyYZzZzZz'],

    // Test all characters in the Unicode range 0x01CD to 0x024F.
    ['ǍǎǏ', 'AaI'],
    ['ǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟ', 'iOoUuUuUuUuUuǝAa'],
    ['ǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ', 'AaǢǣGgGgKkOoOoǮǯ'],
    ['ǰDZDzdzǴǵǶǷǸǹǺǻǼǽǾǿ', 'jDZDzdzGgǶǷNnAaǼǽOo'],
    ['ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏ', 'AaAaEeEeIiIiOoOo'],
    ['ȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟ', 'RrRrUuUuSsTtȜȝHh'],
    ['ȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯ', 'ȠȡȢȣZzAaEeOoOoOo'],
    ['ȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ', 'OoYylntjȸȹACcLTs'],
    ['ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ', 'zɁɂBUɅEeJjQqRrYy'],
  ];
}
/**
 * Tests PhpTransliteration::transliterate() against the provided cases.
 *
 * @param string $langcode
 *   The language code to test.
 * @param string $original
 *   The original string.
 * @param string $expected
 *   The expected return from PhpTransliteration::transliterate().
 * @param string $unknown_character
 *   (optional) The character to substitute for characters in $original
 *   without transliterated equivalents. Defaults to '?'.
 * @param int $max_length
 *   (optional) If provided, return at most this many characters, ensuring
 *   that the transliteration does not split in the middle of an input
 *   character's transliteration.
 *
 * @dataProvider providerTestPhpTransliteration
 */
public function testPhpTransliteration($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
  $transliterator = new PhpTransliteration();
  // The comparison is strict: the result must be the exact expected string.
  $this->assertSame(
    $expected,
    $transliterator->transliterate($original, $langcode, $unknown_character, $max_length)
  );
}
/**
 * Provides data for self::testPhpTransliteration().
 *
 * @return array
 *   An array of arrays, each containing the parameters for
 *   self::testPhpTransliteration().
 */
public function providerTestPhpTransliteration() {
  // A random string of length 10; the ASCII cases below expect it to come
  // back from transliteration unchanged.
  $random = (new Random())->string(10);

  // Sample strings with multi-byte characters. Note that the $three_byte
  // character is overridden by the 'kg' language.
  $two_byte = 'Ä Ö Ü Å Ø äöüåøhello';
  // A Cyrillic character that looks something like a u. See
  // http://www.unicode.org/charts/PDF/U0400.pdf
  $three_byte = html_entity_decode('ц', ENT_NOQUOTES, 'UTF-8');
  // A Canadian Aboriginal character like a triangle. See
  // http://www.unicode.org/charts/PDF/U1400.pdf
  $four_byte = html_entity_decode('ᐑ', ENT_NOQUOTES, 'UTF-8');
  // Two Gothic alphabet letters. See
  // http://wikipedia.org/wiki/Gothic_alphabet
  // They are not in our tables, but should at least give us '?' (unknown).
  $five_byte = html_entity_decode('𐌰𐌸', ENT_NOQUOTES, 'UTF-8');
  // A byte sequence that is not legal Unicode: 0xF8 followed by four 0x80.
  $illegal = chr(0xF8) . str_repeat(chr(0x80), 4);

  // Each test case is (language code, input, output), optionally followed
  // by the unknown-character replacement and the maximum length.
  $cases = [
    // ASCII in English.
    ['en', $random, $random],
    // ASCII in some other language with no overrides.
    ['fr', $random, $random],
    // The $three_byte and $four_byte characters in a language without
    // overrides. Note: if the data tables change, these will need to change
    // too! They are set up to test that data table loading works, so values
    // come directly from the data files.
    ['fr', $three_byte, 'c'],
    ['fr', $four_byte, 'wii'],
    // The $five_byte characters, which have no table entries.
    ['en', $five_byte, '??'],
    // A language with no overrides.
    ['en', $two_byte, 'A O U A O aouaohello'],
    // Language overrides provided by core.
    ['de', $two_byte, 'Ae Oe Ue A O aeoeueaohello'],
    ['de', $random, $random],
    ['dk', $two_byte, 'A O U Aa Oe aouaaoehello'],
    ['dk', $random, $random],
    ['kg', $three_byte, 'ts'],
    // Strings in some other languages.
    // Turkish, provided by drupal.org user Kartagis.
    ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
    // Illegal/unknown unicode.
    ['en', $illegal, '?'],
    // Maximum length of 5 characters.
    ['de', $two_byte, 'Ae Oe', '?', 5],
  ];

  return $cases;
}
/**
 * Tests the transliteration with max length.
 */
public function testTransliterationWithMaxLength() {
  $transliterator = new PhpTransliteration();

  // German is used because several of its characters transliterate to two
  // letters. The output must never split the transliteration of a single
  // character, so limits of 17 and 18 both yield the same 17-character
  // result.
  $source = 'Ä Ö Ü Å Ø äöüåøhello';
  $truncated = 'Ae Oe Ue A O aeoe';

  foreach ([17, 18] as $limit) {
    $this->assertSame(
      $truncated,
      $transliterator->transliterate($source, 'de', '?', $limit),
      "Truncating to $limit characters works"
    );
  }
}
/**
 * Tests inclusion is safe.
 *
 * @covers ::readLanguageOverrides
 */
public function testSafeInclude() {
  // The overrides in the transliteration data directory transliterate 0x82
  // into "safe" but the overrides one directory higher transliterate the
  // same character into "security hole". So by using "../index" as the
  // language code we can test the ../ is stripped from the langcode.
  vfsStream::setup('transliteration', NULL, [
    'index.php' => '<?php $overrides = ["../index" => [0x82 => "security hole"]];',
    'dir' => [
      'index.php' => '<?php $overrides = ["../index" => [0x82 => "safe"]];',
    ],
  ]);
  // Point the transliterator at the nested directory as its data directory.
  $transliteration = new PhpTransliteration(vfsStream::url('transliteration/dir'));
  // chr(0xC2) . chr(0x82) is the UTF-8 encoding of the character 0x82.
  $transliterated = $transliteration->transliterate(chr(0xC2) . chr(0x82), '../index');
  // Expected value first, actual value second, so that a failure message
  // reports the two the right way round.
  $this->assertSame('safe', $transliterated);
}