/*
   This is a program for transliterating messages composed in KOI-8 or
   Alternate GOST Cyrillic character sets, and for converting them
   between KOI-8 and Alt.  So-called "code 2" (encoding system similar
   to KOI-8) and "code 3" (transliteration system frequently used in
   English-language newspapers) are also supported.

   KOI-8 is a character set popular with the users of UNIX systems;
   Alt is used by MS DOS users.  The codes of Roman (English) characters
   in these character sets are the same, but Cyrillic characters are
   represented differently.

   You can use this program to convert a Cyrillic mail message or text
   file received from a user of a system using different character set.
   If your system can not display Cyrillic characters at all, you can
   transliterate the message.

   COMPILING.

   This is the file "xli.c".  You can compile it with any good C or C++
   compiler you have:

       tcc xli.c
       gcc xli.c
       etc.

   If your compiler generates error messages because it does not accept
   ANSI style function definitions, then you should not use that
   compiler in the first place.  But if you have to, try to replace 1
   with 0 in the "#define ANSI_COMPILER 1" line in the beginning of the
   program, and recompile.

   USAGE.

   Assuming that you named the executable program you obtained "xli",
   and that you want to convert/transliterate text file foo.txt and
   produce new.txt, you can do

       xli a k foo.txt new.txt  ;; to convert from Alt to KOI-8
       xli k a foo.txt new.txt  ;; to convert from KOI-8 to Alt
       xli k x foo.txt new.txt  ;; to transliterate a KOI-8 file
       xli a x foo.txt new.txt  ;; to transliterate an Alt file
       xli a 3 foo.txt new.txt  ;; to transliterate an Alt file into code 3
       etc.

   If you omit the destination file name (new.txt), the output will go
   to the standard output, i.e. to the screen of your system or to
   whatever filter program (more, grep, etc.) you use.  If you omit the
   source file name as well, the input is read from the standard input.

   BUGS.

   Letter "Yo" is not supported.  (I don't know its code.)

   PROBLEMS.

   Report problems to vmenkov@cs.indiana.edu, 1-812-857-9331

   (C) Vladimir Menkov, 1992, 1994.
   Free distribution encouraged.
*/

#include <stdio.h>
#include <stdlib.h>

/* If your compiler does not understand ANSI function definitions,
   replace 1 with 0 in the line below: */
#define ANSI_COMPILER 1

typedef unsigned char uchar;

/* Encoding formats: */
typedef enum { xli, alt, koi, code2, code3, mswin } coding;

/*
   All tables below are indexed by an internal letter number ("sh"):
   0x00..0x1F are the lower-case letters and 0x20..0x3F the upper-case
   letters, both in the same order.  A value of -1 means "not a
   Cyrillic letter".
*/

/* Transliteration table.  "Convenient" transliteration */
static uchar sh2xli [0x40][5] = {
  /* lower case */
  "a",  "b",    "v",  "g",  "d",  "e",  "zh", "z",
  "i",  "j",    "k",  "l",  "m",  "n",  "o",  "p",
  "r",  "s",    "t",  "u",  "f",  "h",  "c",  "ch",
  "sh", "shch", "''", "y",  "'",  "e'", "ju", "ja",
  /* upper case */
  "A",  "B",    "V",  "G",  "D",  "E",  "Zh", "Z",
  "I",  "J",    "K",  "L",  "M",  "N",  "O",  "P",
  "R",  "S",    "T",  "U",  "F",  "H",  "C",  "Ch",
  "Sh", "Shch", "''", "Y",  "'",  "E'", "Ju", "Ja"
};

/* Newspaper-style transliteration, a.k.a code 3 */
static uchar sh2code3 [0x40][5] = {
  /* lower case */
  "a",  "b",    "v",  "g",  "d",  "e",  "zh", "z",
  "i",  "y",    "k",  "l",  "m",  "n",  "o",  "p",
  "r",  "s",    "t",  "u",  "f",  "kh", "ts", "ch",
  "sh", "shch", "'",  "y",  "'",  "e",  "yu", "ya",
  /* upper case */
  "A",  "B",    "V",  "G",  "D",  "E",  "Zh", "Z",
  "I",  "Y",    "K",  "L",  "M",  "N",  "O",  "P",
  "R",  "S",    "T",  "U",  "F",  "Kh", "Ts", "Ch",
  "Sh", "Shch", "'",  "Y",  "'",  "E",  "Yu", "Ya"
};

/* Code 2.  Both halves of the table hold the same 32 characters. */
static unsigned char sh2code2 [0x41] =
  "abwgdevzijklmnoprstufhc4|}_yx{~qabwgdevzijklmnoprstufhc4|}_yx{~q";

/* Alt */
static uchar sh2alt [0x40] = {
  /* lowercase */
  0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
  0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
  0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
  0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
  /* uppercase */
  0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
  0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
  0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
  0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F
};

/* KOI-8 */
static uchar sh2koi [0x40] = {
  /* lowercase */
  0301, 0302, 0327, 0307, 0304, 0305, 0326, 0332,
  0311, 0312, 0313, 0314, 0315, 0316, 0317, 0320,
  0322, 0323, 0324, 0325, 0306, 0310, 0303, 0336,
  0333, 0335, 0337, 0331, 0330, 0334, 0300, 0321,
  /* uppercase; */
  0341, 0342, 0367, 0347, 0344, 0345, 0366, 0372,
  0351, 0352, 0353, 0354, 0355, 0356, 0357, 0360,
  0362, 0363, 0364, 0365, 0346, 0350, 0343, 0376,
  0373, 0375, 0377, 0371, 0370, 0374, 0340, 0361
};

/* Microsoft Windows */
static uchar sh2mswin [0x40] = {
  /* lowercase */
  0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
  0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
  0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
  /* uppercase; */
  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
  0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
  0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf
};

/*
   Maps a KOI-8 byte to its letter number, or -1 if the byte is not in
   sh2koi.  This is a linear search, but it is only called while
   make_table() fills the 256-entry lookup table, never per input
   character.
*/
#if (ANSI_COMPILER)
int koi2sh( uchar c)
#else
int koi2sh(c)
     uchar c;
#endif
{
  int i;

  for (i = 0; i < 0x40; i++) {
    if (sh2koi[i] == c)
      return i;
  }
  return -1;
}

/*
   Maps a code 2 character to its letter number, or -1 if the character
   is not in sh2code2.  Only the second (upper-case) half of the table
   is searched, so a match always yields a number in 0x20..0x3F.  Like
   koi2sh(), it is only called from make_table().
*/
#if (ANSI_COMPILER)
int code22sh( uchar c)
#else
int code22sh(c)
     uchar c;
#endif
{
  int i;

  for (i = 0x20; i < 0x40; i++) {
    if (sh2code2[i] == c)
      return i;
  }
  return -1;
}

/*
   Maps an Alt byte to its letter number, or -1 if it is not a letter:
     0x80..0x9F -> upper case 0x20..0x3F
     0xA0..0xAF -> lower case 0x00..0x0F
     0xE0..0xEF -> lower case 0x10..0x1F
*/
#if (ANSI_COMPILER)
int alt2sh( uchar c)
#else
int alt2sh(c)
     uchar c;
#endif
{
  if (c >= 0x80 && c < 0xA0)
    return c - 0x80 + 0x20;
  if (c >= 0xA0 && c < 0xB0)
    return c - 0xA0;
  if (c >= 0xE0 && c < 0xF0)
    return c - 0xE0 + 0x10;
  return -1;
}

/*
   Maps an MS Windows byte to its letter number, or -1 if it is not a
   letter:
     0xC0..0xDF -> upper case 0x20..0x3F
     0xE0..0xFF -> lower case 0x00..0x1F
*/
#if (ANSI_COMPILER)
int mswin2sh( unsigned int c)
#else
int mswin2sh(c)
     unsigned int c;
#endif
{
  if (c >= 0xC0 && c < 0xE0)
    return (int) (c - 0xC0) + 0x20;
  if (c >= 0xE0 && c < 0x100)
    return (int) (c - 0xE0);
  return -1;
}

/* Letter number for every possible input byte; filled by make_table(). */
static int char2sh[ 0x100 ];

/*
   Fills the table char2sh for the given source encoding.  Bytes that
   are not letters in that encoding get -1.  A source encoding that has
   no decoder (xli, code3) gets -1 everywhere, so that Decode() copies
   the input through unchanged.
*/
#if (ANSI_COMPILER)
void make_table( coding source )
#else
void make_table( source )
     coding source;
#endif
{
  int c;
  int sh;

  for (c = 0; c < 0x100; c++) {
    switch (source) {
    case alt:
      sh = alt2sh( (uchar) c );
      break;
    case mswin:
      sh = mswin2sh( (unsigned int) c );
      break;
    case koi:
      sh = koi2sh( (uchar) c );
      break;
    case code2:
      sh = code22sh( (uchar) c );
      break;
    default:
      sh = -1;
      break;
    }
    char2sh[ c ] = sh;
  }
}

/*
   Copies stream f to stream g until end of file, replacing every byte
   that char2sh maps to a letter with that letter's representation in
   the target encoding.  All other bytes are copied unchanged.
   make_table() must have been called for the source encoding first.
*/
#if (ANSI_COMPILER)
void Decode(FILE *f, FILE *g, coding target)
#else
void Decode(f,g, target)
     FILE *f, *g;
     coding target;
#endif
{
  int c, sh;

  while ((c = getc(f)) != EOF) {
    sh = char2sh[ c ];
    if (sh == -1) {
      putc( c, g);
      continue;
    }
    switch (target) {
    case xli:
      fputs( (char *) sh2xli[sh], g);
      break;
    case code3:
      fputs( (char *) sh2code3[sh], g);
      break;
    case koi:
      putc( sh2koi[sh], g);
      break;
    case code2:
      putc( sh2code2[sh], g);
      break;
    case alt:
      putc( sh2alt[sh], g);
      break;
    case mswin:
      putc( sh2mswin[sh], g);
      break;
    default:
      break;
    }
  }
}

/* Program name shown in the usage message; set by main(). */
static char * progname;

/*
   Prints the command-line help to stderr and terminates the program
   with exit status 1.  stderr is used so that the help never ends up
   in converted output that is being piped or redirected.
*/
void usage()
{
  fprintf(stderr,
          "\nUsage: %s source-format target-format [input-file [output-file]]",
          progname);
  fprintf(stderr, "\nFormat is any of the a,k,x,w,2,3:");
  fprintf(stderr, "\n%s\n%s",
          "\n a : Alt ASCII; k : KOI-8; x : transliteration; w: MS Win;",
          "2: code 2; 3: code 3");
  fprintf(stderr, "\nFor example: %s a x foo.txt", progname);
  fprintf(stderr,
   "\nto transliterate `foo.txt' composed in Alt encoding on MSDOS machine");
  fprintf(stderr, "\n");
  exit(1);
}

/*
   Converts a one-letter format argument (a, k, x, w, 2 or 3) into the
   corresponding coding value.  Any other argument, including an empty
   one, ends the program through usage().
*/
#if (ANSI_COMPILER)
coding get_format( char * s)
#else
coding get_format(s)
     char * s;
#endif
{
  /* Check s[0] first: for an empty string s[1] lies past the
     terminator and must not be read. */
  if (s[0] == '\0' || s[1] != '\0')
    usage();

  switch (s[0]) {
  case 'a': return alt;
  case 'k': return koi;
  case 'x': return xli;
  case 'w': return mswin;
  case '2': return code2;
  case '3': return code3;
  default:
    usage();
  }
  return xli; /* Just to prevent warning */
}

/*
   Entry point:  xli source-format target-format [input-file [output-file]]
   Missing file names default to stdin and stdout.  Returns 0 on
   success and 1 on any usage or I/O error.
*/
#if (ANSI_COMPILER)
int main(int argc, char * argv[])
#else
int main(argc, argv)
     int argc;
     char * argv[];
#endif
{
  FILE *f, *g;
  coding source, target;
  int failed = 0;

  progname = (argc > 0 && argv[0]) ? argv[0] : "xli";

  if (argc < 3 || argc > 5)
    usage();

  source = get_format( argv[1]);
  target = get_format( argv[2]);

  if (source == xli || source == code3) {
    fprintf(stderr, "\nCan't convert FROM transliterated format!\n");
    exit(1);
  }

  /* "r" and "w" are the standard text modes; "rt"/"wt" are not
     defined by ISO C. */
  if (argc > 3) {
    f = fopen( argv[3], "r");
    if (!f) {
      fprintf(stderr, "\nCan't read from `%s'\n", argv[3] );
      exit(1);
    }
  } else {
    f = stdin;
  }

  if (argc > 4) {
    g = fopen( argv[4], "w");
    if (!g) {
      fprintf(stderr, "\nCan't write to `%s'\n", argv[4] );
      exit(1);
    }
  } else {
    g = stdout;
  }

  make_table( source );
  Decode(f, g, target);

  if (ferror(f)) {
    fprintf(stderr, "\nError while reading input\n");
    failed = 1;
  }
  if (ferror(g))
    failed = 1;

  fclose(f);
  /* fclose flushes the output, so a failure here is a write error too. */
  if (fclose(g) != 0)
    failed = 1;

  if (failed) {
    fprintf(stderr, "\nConversion failed because of an I/O error\n");
    return 1;
  }
  return 0;
}