PHP-BRIEF   17

transliteration_data.php

Guest on 10th August 2021 04:55:53 PM

  1. <?php
  2.  
  3. /**
  4.  * @file
  5.  * Unifies formats of transliteration data from various sources.
  6.  *
  7.  * A few notes about this script:
  8.  * - The functions in this file are NOT SECURE, because they use PHP functions
  9.  *   like eval(). Absolutely do not run this script unless you trust the data
  10.  *   files used for input.
  11.  * - You will need to change the name of this file to remove the .txt extension
  12.  *   before running it (it has been given this name so that you cannot run it
  13.  *   by mistake). When you do that, move it out of your web root as well so
  14.  *   that it cannot be run via a URL, and run the script via the PHP command
  15.  *   at a command prompt.
  16.  * - This script, depending on which portions of it you run, depends on having
  17.  *   input data from various sources in sub-directories below where this file
  18.  *   is located. The data inputs are as follows:
  19.  *   - Existing Drupal Core transliteration data: Sub-directory 'data'; comes
  20.  *     from core/lib/Drupal/Component/Transliteration/data
  21.  *   - Midgardmvc data: Sub-directory 'utf8_to_ascii_db'; download from
  22.  *     https://github.com/bergie/midgardmvc_helper_urlize/downloads
  23.  *   - CPAN Text-Unidecode data: Sub-directory 'Unidecode'; download from
  24.  *     http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
  25.  *   - Node.js project: Sub-directory 'unidecoder_data'; download from
  26.  *     https://github.com/bitwalker/stringex/downloads
  27.  *   - JUnidecode project: Sub-directory 'junidecode'; download source from
  28.  *     http://www.ippatsuman.com/projects/junidecode/index.html
  29.  * - You will also need to make directory 'outdata' to hold output.
  30.  * - If you plan to use the 'intl' data, you will also need to have the PECL
  31.  *   packages 'yaml' and 'intl' installed.  See
  32.  *   http://php.net/manual/install.pecl.downloads.php for generic PECL
  33.  *   package installation instructions. The following commands on Ubuntu Linux
  34.  *   will install yaml and intl packages:
  35.  *   @code
  36.  *   sudo apt-get install libyaml-dev
  37.  *   sudo pecl install yaml
  38.  *   sudo apt-get install php5-intl
  39.  *   sudo apt-get install libicu-dev
  40.  *   sudo pecl install intl
  41.  *   @endcode
  42.  *   After running these commands, you will need to make sure
  43.  *   'extension=intl.so' and 'extension=yaml.so' are added to the php.ini file
  44.  *   that is in use for the PHP command-line command.
  45.  * - When you have collected all of the data and installed the required
  46.  *   packages, you will need to find the specific commands below that you want
  47.  *   to use and un-comment them. The preferred data source for Drupal Core is
  48.  *   the PECL 'intl' package, and the line that needs to be un-commented in
  49.  *   order to make a Drupal Core patch is:
  50.  *   @code
  51.  *   patch_drupal('outdata');
  52.  *   @endcode
  53.  * - The functions are documented in more detail in their headers where they
  54.  *   are defined. Many have parameters that you can use to change the output.
  55.  */
  56.  
  57. // Commands to read various data sources:
  58. // $data = read_drupal_data();
  59. // $data = read_midgard_data();
  60. // $data = read_cpan_data();
  61. // $data = read_nodejs_data();
  62. // $data = read_intl_data();
  63. // $data = read_junidecode_data();
  64.  
  65. // After running a read_*_data() function, you can print out the data
  66. // (it will make a LOT of output):
  67. // print_r($data);
  68.  
  69. // Command to read in all of data sources and output in CSV format, explaining
  70. // the differences:
  71. // read_all_to_csv();
  72.  
  // Command to patch Drupal Core data, using the intl data set, and put the
  // resulting changed data files in the 'outdata' directory. The readers in
  // this file eval() their input, so refuse to run anywhere but a command-line
  // SAPI, as the notes in the file header require.
  if (!in_array(PHP_SAPI, array('cli', 'phpdbg'), TRUE)) {
    exit('This script must be run from the command line.');
  }
  patch_drupal('outdata');
  76.  
  /**
   * Reads in all transliteration data and outputs differences in CSV format.
   *
   * Each data set is compared to the Drupal Core reference data set, and the
   * differences are noted. The data must be in the locations noted in the
   * file header. The CSV output has several columns. The first one is the
   * Unicode character code. The next columns contain the transliteration of
   * that character in each of the data sets. The last column tells what the
   * differences are between the Drupal Core reference set and the other data
   * sets (several reasons are joined with ':'):
   * - missing: The target set is missing data that the Drupal set has.
   * - provided: The target set has provided data that Drupal does not have.
   * - case: The target and Drupal set output differ only in upper/lower case.
   * - different: The target and Drupal set output differ in more than just case.
   *
   * Transliterations are written as double-quoted CSV fields, with embedded
   * double quotes doubled, and fields are separated by a bare comma so that
   * data rows line up with the header row.
   *
   * @param bool $print_all
   *   TRUE to print all data; FALSE (default) to print just data where there
   *   are differences between the Drupal set and other data sources.
   * @param bool $print_missing
   *   TRUE to print cases where one of the non-Drupal sets is missing information
   *   and that is the only difference; FALSE (default) to exclude these rows.
   */
  function read_all_to_csv($print_all = FALSE, $print_missing = FALSE) {
    $data = array();
    $types = array('drupal', 'midgard', 'cpan', 'nodejs', 'junidecode', 'intl');

    // Alternatively, if you just want to compare a couple of data sets, you can
    // uncomment and edit the following line ('drupal' must stay in the list,
    // because it is the reference set):
    // $types = array('drupal', 'intl');

    // Read in all the data.
    foreach ($types as $type) {
      $data[$type] = call_user_func('read_' . $type . '_data');
    }

    // Print CSV header row.
    print 'character,' . implode(',', $types) . ",why\n";

    // Go through all the banks of character data.
    for ($bank = 0; $bank < 256; $bank++) {

      // Go through characters in bank; skip pure ASCII characters.
      $start = ($bank == 0) ? 0x80 : 0;
      for ($chr = $start; $chr < 256; $chr++) {

        // Gather the data together for this character. Anything that is not a
        // string (NULL, missing, numeric) counts as "no transliteration".
        $row = array();
        foreach ($types as $type) {
          $value = isset($data[$type][$bank][$chr]) ? $data[$type][$bank][$chr] : '';
          $row[$type] = is_string($value) ? $value : '';
        }

        // Only print if there are differences or we are printing all data.
        $print = $print_all;
        $ref = $row['drupal'];
        $why = array();
        foreach ($types as $type) {
          // Strict comparison: loose comparison would treat numeric strings
          // such as '1' and '01' as equal and hide real differences.
          if ($row[$type] === $ref) {
            continue;
          }

          // Try to characterize what the differences are.
          if ($row[$type] === '') {
            $why['missing'] = 'missing';
            if ($print_missing) {
              $print = TRUE;
            }
          }
          elseif ($ref === '') {
            $why['provided'] = 'provided';
            $print = TRUE;
          }
          elseif ($row[$type] === strtolower($ref) || $row[$type] === strtoupper($ref)) {
            $why['case'] = 'case';
            $print = TRUE;
          }
          else {
            $why['different'] = 'different';
            $print = TRUE;
          }
        }

        // Print the data line.
        if ($print) {
          $fields = array('0x' . sprintf('%04x', 256 * $bank + $chr));
          foreach ($row as $out) {
            // CSV escapes a double quote by doubling it.
            $fields[] = '"' . str_replace('"', '""', $out) . '"';
          }
          $fields[] = implode(':', $why);
          print implode(',', $fields) . "\n";
        }
      }
    }
  }
  170.  
  /**
   * Reads in 'intl' transliteration data and writes out changed Drupal files.
   *
   * Writes out the Drupal data files that would have to change to make our data
   * match the intl data set. A bank is written only if at least one of its
   * characters has a non-empty intl transliteration that is not identical to
   * the Drupal one; characters without intl data keep their Drupal value.
   *
   * @param string $outdir
   *   Directory to put the patched data files in (under where the script is
   *   being run).
   */
  function patch_drupal($outdir) {
    // Only these two data sets are involved; the comparison below is written
    // specifically for them.
    $data = array(
      'drupal' => read_drupal_data(),
      'intl' => read_intl_data(),
    );

    // Go through all the banks of character data.
    for ($bank = 0; $bank < 256; $bank++) {
      $bank_changed = FALSE;

      // Bank 0 starts with the pure ASCII characters, which map to themselves.
      $start = ($bank == 0) ? 0x80 : 0;
      $newdata = array();
      for ($chr = 0; $chr < 256; $chr++) {
        // Fill up the start of the ASCII range.
        if ($chr < $start) {
          $newdata[$chr] = chr($chr);
          continue;
        }

        // Figure out what characters we actually have.
        $drupal = isset($data['drupal'][$bank][$chr]) ? $data['drupal'][$bank][$chr] : NULL;
        // For intl, an empty transliteration counts as "no data".
        $intl = isset($data['intl'][$bank][$chr]) ? $data['intl'][$bank][$chr] : NULL;
        if ($intl === '') {
          $intl = NULL;
        }

        // Start from the Drupal value, so that the bank is complete if it has
        // to be written out.
        $newdata[$chr] = $drupal;

        // Strict comparison: loose comparison would treat numeric strings such
        // as '1' and '01' as equal and silently keep the Drupal value.
        if ($intl !== NULL && $intl !== $drupal) {
          $bank_changed = TRUE;
          $newdata[$chr] = $intl;
        }
      }

      // If we found a difference, output a data file.
      if ($bank_changed) {
        write_data_file($newdata, $bank, $outdir);
      }
    }
  }
  231.  
  /**
   * Loads the generic transliteration tables shipped with Drupal Core.
   *
   * Looks for files named xNN.php (NN being the two-digit lower-case hex bank
   * number) in the 'data' directory next to this file. Each file that exists is
   * included and is expected to assign its table to $base.
   *
   * @return array
   *   Nested array of transliteration data, with one entry for each of the 256
   *   banks. Outer keys are the bank numbers (the high byte of the character
   *   code), inner keys are the low byte, and the values are the
   *   transliterations. A bank without a data file is an empty array.
   *
   * @see PhpTransliteration::readGenericData()
   */
  function read_drupal_data() {
    $out = array();
    $dir = __DIR__ . '/data';

    foreach (range(0, 255) as $bank) {
      // The included file, if there is one, replaces this empty default.
      $base = array();

      $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
      if (is_file($file)) {
        include($file);
      }

      $out[$bank] = $base;
    }

    return $out;
  }
  261.  
  /**
   * Loads the MidgardMVC transliteration tables.
   *
   * Looks for files named xNN.php in the 'utf8_to_ascii_db' directory next to
   * this file. The data can be downloaded from
   * https://github.com/bergie/midgardmvc_helper_urlize/downloads. Each file
   * that exists is included and is expected to fill in $UTF8_TO_ASCII[bank].
   *
   * @return array
   *   Nested array of transliteration data, with one entry for each of the 256
   *   banks. Outer keys are the bank numbers (the high byte of the character
   *   code), inner keys are the low byte, and the values are the
   *   transliterations, with the '[?]' placeholder turned into NULL.
   */
  function read_midgard_data() {
    $dir = __DIR__ . '/utf8_to_ascii_db';
    $out = array();

    $bank = 0;
    while ($bank < 256) {
      // Empty default for this bank; the included file, if any, overwrites it.
      $UTF8_TO_ASCII = array($bank => array());

      $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
      if (is_file($file)) {
        include($file);
      }

      // These files mark unknown characters with '[?]'; our data uses NULL.
      $out[$bank] = array_map('_replace_question_with_null', $UTF8_TO_ASCII[$bank]);
      $bank++;
    }

    return $out;
  }
  295.  
  /**
   * Loads the CPAN Text::Unidecode transliteration tables.
   *
   * Looks for files named xNN.pm in the 'Unidecode' directory next to this
   * file. The data can be downloaded from
   * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm.
   *
   * @return array
   *   Nested array of transliteration data, with one entry for each of the 256
   *   banks. Outer keys are the bank numbers (the high byte of the character
   *   code), inner keys are the low byte, and the values are the
   *   transliterations. A bank without a data file is an empty array.
   *
   * @see _cpan_read_file()
   */
  function read_cpan_data() {
    $dir = __DIR__ . '/Unidecode';
    $out = array();

    foreach (range(0, 255) as $bank) {
      $file = $dir . '/x' . sprintf('%02x', $bank) . '.pm';
      // Only banks that have a Perl module on disk get parsed.
      $out[$bank] = is_file($file) ? _cpan_read_file($file) : array();
    }

    return $out;
  }
  324.  
  /**
   * Reads in the data in a single file from the Text::Unidecode CPAN project.
   *
   * The Perl source is rewritten line by line into a PHP array literal, which
   * is then evaluated.
   *
   * NOTE(review): this calls eval() on text derived from the input file, so it
   * must only ever be run on trusted data files.
   *
   * If the file cannot be read, or the rewritten text does not evaluate to an
   * array, a message is printed and the script exits with status 1.
   *
   * @param string $file
   *   File to read from.
   *
   * @return array
   *   Data read from the file, with '\xNN' entries decoded and the '[?]'
   *   placeholder replaced by NULL.
   *
   * @see read_cpan_data()
   */
  function _cpan_read_file($file) {
    $contents = file($file);
    if ($contents === FALSE) {
      print "Problem in reading $file\n";
      exit(1);
    }

    $save = '';
    foreach ($contents as $line) {
      // Discard lines starting with # or $. The first line seems to have a
      // comment starting with #, the second has a Perl line like
      // $Text::Unidecode::Char[0x04] = [, -- and we do not want either.
      if (preg_match('|^\s*[#\$]|', $line)) {
        continue;
      }

      // Discard lines ending with semi-colons, which we also don't want
      // (there seem to be two of these lines at the end of the files).
      if (preg_match('|;\s*$|', $line)) {
        continue;
      }

      // Replace '[?]' with NULL (that means "don't know how to
      // transliterate"). In some files, this is encoded as qq{[?]} or
      // qq{[?] } instead.
      $line = str_replace('qq{[?]}', 'NULL', $line);
      $line = str_replace('qq{[?] }', 'NULL', $line);
      $line = str_replace("'[?]'", 'NULL', $line);

      // Replace qq{} with either "" or '' or nothing, depending on what is
      // inside it. The order matters: the special cases with braces, a
      // backslash or quotes have to be handled before the generic patterns.
      $line = str_replace('qq{\{}', "'{'", $line);
      $line = str_replace('qq{\}}', "'}'", $line);
      $line = str_replace('qq{\} }', "'} '", $line);
      $line = str_replace("qq{\\\\}", '"\\\\"', $line);
      $line = str_replace("qq{\\", "qq{'", $line);
      $line = str_replace("qq{\"'}", "\"\\\"'\"", $line);
      $line = preg_replace('|qq\{([^\'\}]+)\}|', "'$1'", $line);
      $line = preg_replace('|qq\{([^\}]+)\}|', '"$1"', $line);

      $save .= $line;
    }

    // Now we should have a string that looks like:
    // 'a', 'b', ...
    // Evaluate as an array.
    $save = 'return array(' . $save . ');';

    // Since PHP 7, a syntax error inside eval() throws a ParseError rather
    // than returning FALSE, and '@' does not suppress it, so it has to be
    // caught for the error report below to be reachable.
    $data = NULL;
    $error = '';
    try {
      $data = @eval($save);
    }
    catch (Error $e) {
      $error = $e->getMessage();
    }

    if (!is_array($data)) {
      // There was a problem, so report it and exit.
      print "Problem in evaluating $file\n";
      print $save;
      if ($error !== '') {
        print "\n$error\n";
      }
      exit(1);
    }

    $data = array_map('_replace_hex_with_character', $data);

    // For unknown characters, these files may still have '[?]' in them. Replace
    // with NULL for compatibility with our data.
    return array_map('_replace_question_with_null', $data);
  }
  398.  
  /**
   * Reads in the Node.js transliteration data.
   *
   * The data is expected to be in files xNN.yml in directory unidecoder_data
   * under the directory where this file resides. It can be downloaded from
   * https://github.com/bitwalker/stringex/downloads. You also need the PECL
   * 'yaml' extension installed for this function to work.
   *
   * A file that cannot be parsed into an array raises an E_USER_WARNING and
   * leaves its bank empty.
   *
   * @return array
   *   Nested array of transliteration data, with one entry for each of the 256
   *   banks. Outer keys are the bank numbers (the high byte of the character
   *   code), inner keys are the low byte, and the values are the
   *   transliterations, with the '[?]' placeholder turned into NULL.
   */
  function read_nodejs_data() {
    $dir = __DIR__ . '/unidecoder_data';
    $out = array();

    // Read data files.
    for ($bank = 0; $bank < 256; $bank++) {
      $base = array();
      $file = $dir . '/x' . sprintf('%02x', $bank) . '.yml';
      if (is_file($file)) {
        // yaml_parse_file() returns FALSE on failure, which must not be handed
        // to array_map().
        $parsed = yaml_parse_file($file);
        if (is_array($parsed)) {
          // For unknown characters, these files have '[?]' in them. Replace
          // with NULL for compatibility with our data.
          $base = array_map('_replace_question_with_null', $parsed);
        }
        else {
          trigger_error("Could not parse YAML data in $file", E_USER_WARNING);
        }
      }
      $out[$bank] = $base;
    }

    return $out;
  }
  431.  
  /**
   * Loads the PECL 'intl' Transliterator class's transliteration data.
   *
   * You need to have the PECL 'intl' package installed for this to work, as
   * well as the 'mbstring' functions.
   *
   * Every character of the requested banks is run through a compound
   * transliterator. A character maps to NULL if it lies in the UTF-16
   * surrogate range (U+D800 to U+DFFF) or the private use range (U+E000 to
   * U+F8FF), if transliteration fails, or if the result still contains
   * non-ASCII characters.
   *
   * @param int $first_bank
   *   (optional) First bank (high byte of the character code) to generate.
   *   Values below 0 are treated as 0. Defaults to 0.
   * @param int $last_bank
   *   (optional) Last bank to generate. Values above 255 are treated as 255.
   *   Defaults to 255, so that by default all banks are generated.
   *
   * @return array
   *   Nested array of transliteration data. Outer keys are the bank numbers
   *   (the high byte of the character code), inner keys are the low byte, and
   *   the values are the transliterations.
   *
   * @throws RuntimeException
   *   If the transliterator cannot be created.
   */
  function read_intl_data($first_bank = 0, $last_bank = 255) {
    // In order to transliterate, you first have to create a transliterator
    // object. This needs a list of transliteration operations. You can get a
    // list of available operations with:
    //   print_r(Transliterator::listIDs()); exit();
    // And a few of these are documented on
    // http://userguide.icu-project.org/transforms/general and
    // http://www.unicode.org/reports/tr15/ (for normalizations).
    // There are also maps to the Unicode characters at:
    //  http://www.unicode.org/roadmaps/bmp/
    //  http://www.unicode.org/charts/nameslist/
    $ops = '';

    // The first step in any transform: separate out accents and remove them.
    $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

    // Then you need to do a bunch of language-specific or script-specific
    // transliterations. Here is hopefully a representative set. There are
    // quite a few scripts that don't appear to have rules currently, such
    // as Ethiopian.
    $ops .= 'Greek-Latin; ';
    $ops .= 'Cyrillic-Latin; ';
    $ops .= 'Armenian-Latin; ';
    $ops .= 'Hebrew-Latin; ';
    $ops .= 'Arabic-Latin; ';
    $ops .= 'Syriac-Latin; ';
    $ops .= 'Thaana-Latin; ';
    $ops .= 'Devanagari-Latin; ';
    $ops .= 'Bengali-Latin; ';
    $ops .= 'Gurmukhi-Latin; ';
    $ops .= 'Gujarati-Latin; ';
    $ops .= 'Oriya-Latin; ';
    $ops .= 'Tamil-Latin; ';
    $ops .= 'Telugu-Latin; ';
    $ops .= 'Kannada-Latin; ';
    $ops .= 'Malayalam-Latin; ';
    $ops .= 'Thai-Latin; ';
    $ops .= 'Georgian-Latin; ';
    $ops .= 'Hangul-Latin; ';
    $ops .= 'Mongolian-Latin/BGN; ';
    $ops .= 'Jamo-Latin; ';
    $ops .= 'Katakana-Latin; ';
    $ops .= 'Any-Latin; ';

    // Finally, after transforming to Latin, transform to ASCII.
    $ops .= 'Latin-ASCII; ';

    // Remove any remaining accents and recompose.
    $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

    // Transliterator::create() returns NULL when the rule list is rejected.
    $trans = Transliterator::create($ops);
    if (!$trans) {
      throw new RuntimeException('Could not create the transliterator: ' . intl_get_error_message());
    }

    $first_bank = max(0, (int) $first_bank);
    $last_bank = min(255, (int) $last_bank);
    $out = array();

    // Transliterate all requested characters.
    for ($bank = $first_bank; $bank <= $last_bank; $bank++) {
      // Skip the UTF-16 surrogate (0xD800 - 0xDFFF) and "private use"
      // (0xE000 - 0xF8FF) ranges completely. Bank 0xD8 is the first surrogate
      // bank, so it is part of the skipped range.
      $skip = ($bank >= 0xd8 && $bank <= 0xf8);

      $data = array();
      for ($chr = 0; $chr < 256; $chr++) {
        $data[$chr] = NULL;
        if ($skip) {
          continue;
        }

        $character = mb_convert_encoding(pack('n', 256 * $bank + $chr), 'UTF-8', 'UTF-16BE');
        $result = $trans->transliterate($character);

        // Keep the result only if transliteration succeeded and we have
        // managed to get all the way to ASCII.
        if (is_string($result) && !preg_match('/[^\x00-\x7f]/', $result)) {
          $data[$chr] = $result;
        }
      }
      $out[$bank] = $data;
    }

    return $out;
  }
  520.  
  /**
   * Loads the JUnidecode transliteration tables.
   *
   * Looks for files named XNN.java (NN being the two-digit lower-case hex bank
   * number) in the 'junidecode' directory next to this file. The source can be
   * downloaded from http://www.ippatsuman.com/projects/junidecode/index.html
   *
   * @return array
   *   Nested array of transliteration data, with one entry for each of the 256
   *   banks. Outer keys are the bank numbers (the high byte of the character
   *   code), inner keys are the low byte, and the values are the
   *   transliterations. A bank without a data file is an empty array.
   *
   * @see _junidecode_read_file()
   */
  function read_junidecode_data() {
    $out = array();
    $dir = __DIR__ . '/junidecode';

    $bank = 0;
    do {
      $file = $dir . '/X' . sprintf('%02x', $bank) . '.java';
      if (is_file($file)) {
        $out[$bank] = _junidecode_read_file($file);
      }
      else {
        // No Java source for this bank.
        $out[$bank] = array();
      }
      $bank++;
    } while ($bank < 256);

    return $out;
  }
  549.  
  /**
   * Reads in the data in a single file from the JUnidecode project.
   *
   * The Java source is reduced line by line to the contents of its string
   * array, which is then evaluated as a PHP array literal.
   *
   * NOTE(review): this calls eval() on text derived from the input file, so it
   * must only ever be run on trusted data files.
   *
   * If the file cannot be read, or the reduced text does not evaluate to an
   * array, a message is printed and the script exits with status 1.
   *
   * @param string $file
   *   File to read from.
   *
   * @return array
   *   Data read from the file, with '\xNN' entries decoded and the '[?]'
   *   placeholder replaced by NULL.
   *
   * @see read_junidecode_data()
   */
  function _junidecode_read_file($file) {
    $contents = file($file);
    if ($contents === FALSE) {
      print "Problem in reading $file\n";
      exit(1);
    }

    $save = '';
    foreach ($contents as $line) {
      // Discard lines starting with * or / or package or class or public or },
      // to get rid of comments and Java code.
      if (preg_match('|^\s*[\*/\}]|', $line)) {
        continue;
      }
      // The alternation is grouped so that the start-of-line anchor applies to
      // all three keywords, not just to 'package'.
      if (preg_match('/^\s*(?:package|public|class)\b/', $line)) {
        continue;
      }

      // Some of the lines look like this:
      //      new String("" + (char) 0x00), // 0x00
      // Transform to be '0x00,'. The value is hexadecimal, so the digits a-f
      // have to be accepted as well.
      $line = preg_replace('|^\s*new\s+String\s*\(\s*""\s*\+\s*\(char\)\s+0x([0-9a-fA-F]+).*$|', '0x$1,', $line);

      // Strings are in double quotes, yet many have \' in them.
      $line = str_replace("\'", "'", $line);

      // Everything else should probably be OK -- the lines are like:
      //  "Ie", // 0x00
      $save .= $line;
    }

    // Evaluate as an array.
    $save = 'return array(' . $save . ');';

    // Since PHP 7, a syntax error inside eval() throws a ParseError rather
    // than returning FALSE, and '@' does not suppress it, so it has to be
    // caught for the error report below to be reachable.
    $data = NULL;
    $error = '';
    try {
      $data = @eval($save);
    }
    catch (Error $e) {
      $error = $e->getMessage();
    }

    if (!is_array($data)) {
      // There was a problem, so report it and exit.
      print "Problem in evaluating $file\n";
      print $save;
      if ($error !== '') {
        print "\n$error\n";
      }
      exit(1);
    }

    $data = array_map('_replace_hex_with_character', $data);
    return array_map('_replace_question_with_null', $data);
  }
  605.  
  /**
   * Callback for array_map(): Returns $data, with '[?]' replaced with NULL.
   *
   * Only the exact strings '[?]' and '[?] ' are replaced. The comparison is
   * strict, so that non-string values such as the integer 0 are returned
   * unchanged on every PHP version.
   *
   * @param mixed $data
   *   A single transliteration value.
   *
   * @return mixed
   *   NULL if $data is the "unknown character" placeholder, otherwise $data.
   */
  function _replace_question_with_null($data) {
    return in_array($data, array('[?]', '[?] '), TRUE) ? NULL : $data;
  }
  612.  
  /**
   * Callback for array_map(): Replaces '\xNN' with the actual character.
   *
   * Only string items that start with a literal backslash followed by 'x' are
   * touched; in those, every '\xNN' sequence (one or two hex digits) is
   * replaced by the byte with that value. The sequences are decoded directly
   * rather than with eval(), because a bare '\xNN' is not a PHP statement and
   * cannot be evaluated.
   *
   * @param mixed $item
   *   A single transliteration value.
   *
   * @return mixed
   *   The decoded string, or $item unchanged if it is not a string starting
   *   with '\x'.
   */
  function _replace_hex_with_character($item) {
    if (!is_string($item) || strpos($item, '\x') !== 0) {
      return $item;
    }

    return preg_replace_callback('/\\\\x([0-9A-Fa-f]{1,2})/', function ($match) {
      return chr(hexdec($match[1]));
    }, $item);
  }
  622.  
  /**
   * Writes a data file out in the standard Drupal Core data format.
   *
   * The file is named xNN.php, where NN is the two-digit lower-case hex bank
   * number, and assigns the table to $base, 16 entries per line. For bank 0,
   * the ASCII range (0x00 to 0x7F) is left out of the file.
   *
   * @param array $data
   *   Array of data to write out, keyed by the low byte of the character code
   *   (0 to 255). Entries that are missing or NULL are written as NULL.
   * @param int $bank
   *   Bank of characters it belongs to.
   * @param string $outdir
   *   Output directory, relative to the directory this file is in. It is
   *   created if it does not exist yet.
   *
   * @throws RuntimeException
   *   If the output directory cannot be created or the file cannot be written.
   */
  function write_data_file($data, $bank, $outdir) {
    $dir = __DIR__ . '/' . $outdir;
    // The final is_dir() check covers the directory appearing between the
    // first check and the mkdir() call.
    if (!is_dir($dir) && !mkdir($dir, 0777, TRUE) && !is_dir($dir)) {
      throw new RuntimeException("Could not create output directory $dir");
    }
    $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';

    $out = '';
    $out .= "<?php\n\n/**\n * @file\n * Generic transliteration data for the PhpTransliteration class.\n */\n\n\$base = array(\n";

    // The 00 file skips the ASCII range
    $start = 0;
    if ($bank == 0) {
      $start = 0x80;
      $out .= "  // Note: to save memory plain ASCII mappings have been left out.\n";
    }

    // Each output line carries an explicit key for its first entry; the 15
    // entries after it rely on PHP's automatic key numbering.
    for ($line = $start; $line <= 0xf0; $line += 0x10) {
      $out .= '  0x' . sprintf('%02X', $line) . ' =>';
      for ($i = 0; $i < 16; $i++) {
        // Look entries up by key rather than by position, so that an array
        // with gaps cannot shift values onto the wrong character.
        $key = $line + $i;
        if (isset($data[$key])) {
          $out .= " '" . addcslashes($data[$key], "'\\") . "',";
        }
        else {
          $out .= ' NULL,';
        }
      }
      $out .= "\n";
    }

    $out .= ");\n";

    if (file_put_contents($file, $out) === FALSE) {
      throw new RuntimeException("Could not write data file $file");
    }
  }

Raw Paste


Login or Register to edit or fork this paste. It's free.