#!/bin/sh
# scripts/unicode/make_to_ascii_letters_tables.sh
# This file is part of libpbe; see http://svn.chezphil.org/libpbe/
# (C) 2008 Philip Endecott

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

# Builds the per-page "to ASCII" lookup tables from the Unicode character
# database.
#
# Inputs (read from the current directory):
#   UnicodeData.txt       the Unicode character database (required)
#   local_decompositions  extra 'CODE "string"' lines appended to every table
#
# Outputs (written to the current directory):
#   to_ascii_letters_tables.cc  expansions reduced to lower-case ASCII letters
#   to_ascii_alnums_tables.cc   as above, but ASCII digits are kept as well
#   to_ascii_tables.cc          expansions reduced to printable ASCII
#
# The intermediate files decompositions, full_decompositions,
# all_decompositions, *.d0 and *.d1 are left behind in the current directory.
#
# The awk programs call strtonum(), which is a GNU awk extension.

set -eu

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Download the Unicode character database:
# wget http://www.unicode.org/Public/5.1.0/ucd/UnicodeData.txt

[ -r UnicodeData.txt ] ||
  die "UnicodeData.txt not found in the current directory (see the wget line in this script)"

# Fail early, rather than half-way through a pipeline, if awk cannot parse hex.
awk 'BEGIN { if (strtonum("0x41") != 65) exit 1 }' 2>/dev/null ||
  die "awk does not provide a hex-capable strtonum(); GNU awk is required"

# Extract the decompositions: field 1 is the code point, field 6 the
# decomposition mapping.
#
# NOTE(review): the second filter of the original pipeline was truncated in the
# copy this script was restored from (everything between a "<" and the output
# redirection was lost).  It is reconstructed here as "strip the leading <tag>
# of the mapping", which is what the description of the decompositions file
# below calls for -- confirm against the upstream script.
awk -F ';' '
  $6 != "" {
    mapping = $6
    sub(/^<[^>]*> */, "", mapping)
    if (mapping != "") print $1, mapping
  }
' UnicodeData.txt > decompositions

# Each line in decompositions starts with the source character and is followed
# by the sequence of characters that it may be replaced by.  This replacement
# needs to be applied recursively, so keep substituting until a pass changes
# nothing.
while true; do
  # First read of the file (FNR==NR) loads the map; the second read rewrites
  # every line, replacing each code that itself has a decomposition.
  awk '
    FNR == NR {
      d = ""
      for (i = 2; i <= NF; i++) d = d $i " "
      dec[$1] = d
      next
    }
    {
      printf("%s ", $1)
      for (i = 2; i <= NF; i++) {
        if ($i in dec) printf("%s", dec[$i]); else printf("%s ", $i)
      }
      printf("\n")
    }
  ' decompositions decompositions > new_decompositions
  # -s: only the exit status matters; do not report the differing byte.
  if cmp -s decompositions new_decompositions; then
    break
  fi
  mv new_decompositions decompositions
done
mv new_decompositions full_decompositions

#######################################
# Write the struct initialisers for one flavour of table to stdout.
# Arguments:
#   $1 - mode: "letters" (keep A-Z a-z, lower-cased),
#              "alnums"  (keep A-Z a-z 0-9, letters lower-cased) or
#              "ascii"   (keep every printable ASCII character unchanged)
#   $2 - extension of the per-page work files (e.g. d0)
#   $3 - prefix of the element type name  (<prefix>_page_<page>_t)
#   $4 - prefix of the table name         (<prefix>_page_<page>)
# Reads:   full_decompositions, local_decompositions
# Writes:  all_decompositions, <page>.<ext>; removes any existing *.<ext>
#######################################
build_tables() {
  bt_mode=$1
  bt_ext=$2
  bt_type=$3
  bt_table=$4

  # Emit the identity (and, for letters, to-lower) conversions for ASCII
  # itself, then one 'CODE "string"' line per decomposition that still has
  # at least one wanted character after filtering.
  awk -v mode="$bt_mode" '
    function keep(n) {
      if (mode == "ascii") return (n >= 32 && n < 127)
      if ((n >= 65 && n <= 90) || (n >= 97 && n <= 122)) return 1
      return (mode == "alnums" && n >= 48 && n <= 57)
    }
    BEGIN {
      if (mode == "ascii") {
        for (i = 32; i < 127; i++) printf("%04X \"%c\"\n", i, i)
      } else {
        for (i = 65; i <= 90; i++) printf("%04X \"%c\"\n", i, i + 32)
        for (i = 97; i <= 122; i++) printf("%04X \"%c\"\n", i, i)
        if (mode == "alnums") {
          for (i = 48; i <= 57; i++) printf("%04X \"%c\"\n", i, i)
        }
      }
    }
    {
      s = ""
      for (i = 2; i <= NF; i++) {
        n = strtonum("0x" $i)
        if (keep(n)) {
          c = sprintf("%c", n)
          s = s (mode == "ascii" ? c : tolower(c))
        }
      }
      if (s != "") printf("%s \"%s\"\n", $1, s)
    }
  ' full_decompositions > all_decompositions

  # Add some more local decompositions.  They come last so that they override
  # the generated entries when the page arrays are filled in below.
  if [ -r local_decompositions ]; then
    cat local_decompositions >> all_decompositions
  else
    printf '%s\n' "warning: local_decompositions not found; continuing without it" >&2
  fi

  # Split it up into one file per unicode page (the code point minus its last
  # two hex digits).  -f: it is not an error if there is nothing to remove.
  rm -f -- *."$bt_ext"
  awk -v ext="$bt_ext" '
    { page = substr($1, 1, length($1) - 2); print > (page "." ext) }
  ' all_decompositions

  # Reformat them as struct initialisers, 256 entries per page.
  for bt_file in *."$bt_ext"; do
    bt_page=${bt_file%."$bt_ext"}
    printf '%s' "${bt_type}_page_${bt_page}_t ${bt_table}_page_${bt_page} [] = {"
    awk -v mode="$bt_mode" '
      {
        n = strtonum("0x" $1)
        if (mode == "ascii") {
          # The string may contain spaces, so take everything after the
          # first field instead of $2 (works for any code point width).
          v = $0
          sub(/^[ \t]*[^ \t]+[ \t]*/, "", v)
        } else {
          v = $2
        }
        a[n % 256] = v
      }
      END {
        for (i = 0; i < 256; i++) {
          if (i in a) printf("%s, ", a[i]); else printf("\"\", ")
        }
        printf("};\n")
      }
    ' "$bt_file"
  done
}

########

# For the purposes of converting strings to text searchable ASCII equivalents,
# we're only interested in determining the ASCII letter base characters.  All
# other characters in the expansion are stripped, and lines with nothing left
# are dropped.
build_tables letters d0 char_expansion to_ascii_letters \
  > to_ascii_letters_tables.cc

########

# This is similar to the preceding section, except that digits are also
# retained.
build_tables alnums d0 char_expansion to_ascii_alnums \
  > to_ascii_alnums_tables.cc

########

# For conversion to ASCII for approximation of missing characters when
# recoding, we're interested in any printable ASCII characters in the
# decomposition.  The single-character strings """ and "\" are not valid in
# the generated source, so munge " and \ into their escaped forms.
build_tables ascii d1 ascii_char_expansion to_ascii |
  sed 's/"\(["\]\)"/"\\\1"/g' > to_ascii_tables.cc