xenocara/app/xterm/charsets.c

779 lines
17 KiB
C

/* $XTermId: charsets.c,v 1.126 2024/05/22 00:27:53 tom Exp $ */
/*
* Copyright 1998-2023,2024 by Thomas E. Dickey
*
* All Rights Reserved
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Except as contained in this notice, the name(s) of the above copyright
* holders shall not be used in advertising or otherwise to promote the
* sale, use or other dealings in this Software without prior written
* authorization.
*
*/
#include <assert.h>
#include <X11/keysym.h>
#include <xterm.h>
#include <data.h>
#include <charsets.h>
#include <fontutils.h>
/*
* This module performs translation as needed to support the DEC VT220 national
* replacement character sets as well as supplementary character sets (aka
* code-pages) introduced in VT320, etc.
*
* We assume that xterm's font is based on the ISO 8859-1 (Latin 1) character
* set, which is almost the same as the DEC multinational character set. Glyph
* positions 0-31 have to be the DEC graphic characters, though.
*
* References:
* "VT220 Programmer Pocket Guide" EK-VT220-HR-002 (2nd ed., 1984), which
* contains character charts for the national character sets.
* "VT330/VT340 Programmer Reference Manual Volume 1: Text Programming"
* EK-VT3XX-TP-001 (1st ed, 1987), which contains a table (2-1)
* listing the glyphs which are mapped from the multinational
* character set to the national character set.
*
* The latter reference, though easier to read, has a few errors and omissions.
*/
/*
 * Resolve the nrc_DEC_UPSS placeholder to a concrete character set.
 *
 * "charset" must be a modifiable lvalue; any value other than nrc_DEC_UPSS
 * is left untouched.  A variable named "screen" (pointer to the screen
 * state) must be in scope at the point of use.
 *
 * When charset is nrc_DEC_UPSS it is replaced by screen->gsets_upss, which
 * is then adjusted according to screen->vtXX_level:
 *   >= 5  keep the selected set as-is
 *   3..4  keep it only if it is nrc_DEC_Supp, otherwise use
 *         nrc_ISO_Latin_1_Supp
 *   2     keep the selected set as-is
 *   < 2   use nrc_ASCII
 *
 * The body is wrapped in do { } while (0) so that "HandleUPSS(x);" is a
 * single statement and can be used safely as the body of an unbraced
 * if/else.
 */
#define HandleUPSS(charset) \
    do { \
	if ((charset) == nrc_DEC_UPSS) { \
	    (charset) = screen->gsets_upss; \
	    if (screen->vtXX_level >= 5) { \
		/* EMPTY */ ; \
	    } else if (screen->vtXX_level >= 3) { \
		if ((charset) != nrc_DEC_Supp) \
		    (charset) = nrc_ISO_Latin_1_Supp; \
	    } else if (screen->vtXX_level < 2) { \
		(charset) = nrc_ASCII; \
	    } \
	} \
    } while (0)
/*
 * Tell whether the given character set is one of those this module treats
 * as seven-bit.  Returns True for the sets in the second group below and
 * False for everything else (including nrc_Unknown and nrc_DEC_UPSS).
 *
 * The switch deliberately has no "default" label, so every enumerator is
 * listed explicitly.
 */
static Boolean
isSevenBit(DECNRCM_codes cs)
{
    switch (cs) {
	/* sets which are not treated as seven-bit */
    case nrc_Unknown:
    case nrc_DEC_UPSS:
    case nrc_ISO_Greek_Supp:
    case nrc_ISO_Hebrew_Supp:
    case nrc_ISO_Latin_1_Supp:
    case nrc_ISO_Latin_2_Supp:
    case nrc_ISO_Latin_5_Supp:
    case nrc_ISO_Latin_Cyrillic:
	return False;

	/* VT100 character sets */
    case nrc_ASCII:
    case nrc_British:
    case nrc_DEC_Alt_Chars:
    case nrc_DEC_Spec_Graphic:
	/* VT220 character sets */
    case nrc_DEC_Alt_Graphics:
    case nrc_DEC_Supp:
	/* VT320 character sets */
    case nrc_DEC_Supp_Graphic:
    case nrc_DEC_Technical:
	/* NRCS character sets (VT320 to VT520) */
    case nrc_British_Latin_1:
    case nrc_Dutch:
    case nrc_Finnish:
    case nrc_Finnish2:
    case nrc_French:
    case nrc_French2:
    case nrc_French_Canadian:
    case nrc_French_Canadian2:
    case nrc_German:
    case nrc_Greek:
    case nrc_Hebrew:
    case nrc_Italian:
    case nrc_Norwegian_Danish:
    case nrc_Norwegian_Danish2:
    case nrc_Norwegian_Danish3:
    case nrc_Portugese:
    case nrc_Russian:
    case nrc_SCS_NRCS:
    case nrc_Spanish:
    case nrc_Swedish:
    case nrc_Swedish2:
    case nrc_Swiss:
    case nrc_Turkish:
	/* other DEC character sets */
    case nrc_DEC_Cyrillic:
    case nrc_DEC_Greek_Supp:
    case nrc_DEC_Hebrew_Supp:
    case nrc_DEC_Turkish_Supp:
	return True;
    }
    /* a value outside the enumeration is not seven-bit either */
    return False;
}
/*
 * Translate an input keysym to the corresponding NRC keysym.
 *
 * Parameters:
 *	xw	- the widget; its screen state is read for the UPSS selection
 *		  and VT level (via HandleUPSS) and, with OPT_WIDE_CHARS, for
 *		  utf8_nrc_mode.
 *	code	- the input value to translate.
 *	charset	- the character set which selects the translation table;
 *		  nrc_DEC_UPSS is first resolved by HandleUPSS().
 *
 * Returns the translated value, always masked to seven bits.
 */
unsigned
xtermCharSetIn(XtermWidget xw, unsigned code, DECNRCM_codes charset)
{
TScreen *screen = TScreenOf(xw);
/*
 * Building blocks for the map_XXX() table macros used in the switch below
 * (NOTE(review): those tables are not defined in this file - presumably
 * they come from charsets.h).  The parameter order here is (to, from),
 * the reverse of the (from, to) order used by xtermCharSetOut, so a table
 * entry is applied in the opposite direction.
 */
#define MAP(to, from) case from: code = to; break;
#if OPT_WIDE_CHARS
/* applied only while utf8_nrc_mode is set */
#define UNI(to, from) case from: if (screen->utf8_nrc_mode) code = to; break;
#else
/* without wide-character support the entry matches but changes nothing */
#define UNI(to, from) case from: break;
#endif
#define XXX(to, from) /* no defined mapping to 0..255 */
TRACE(("CHARSET-IN GL=%s(G%d) GR=%s(G%d) SS%d\n\t%s\n",
visibleScsCode(screen->gsets[screen->curgl]), screen->curgl,
visibleScsCode(screen->gsets[screen->curgr]), screen->curgr,
screen->curss,
visibleUChar(code)));
/* replace nrc_DEC_UPSS by the set it currently stands for */
HandleUPSS(charset);
switch (charset) {
case nrc_British: /* United Kingdom set (or Latin 1) */
/* the pound-sterling keysym takes the place of '#' (0x23) */
if (code == XK_sterling)
code = 0x23;
code &= 0x7f;
break;
case nrc_DEC_Alt_Chars:
case nrc_DEC_Alt_Graphics:
case nrc_ASCII:
break;
case nrc_DEC_Spec_Graphic:
break;
case nrc_DEC_Supp:
/*
 * NOTE(review): the second argument is an expression with a side effect
 * on "code"; presumably the table macro uses it as its default action -
 * confirm against the macro definition.
 */
map_DEC_Supp_Graphic(code, code &= 0x7f);
break;
case nrc_DEC_Supp_Graphic:
map_DEC_Supp_Graphic(code, code |= 0x80);
break;
case nrc_DEC_Technical:
map_DEC_Technical(code);
break;
case nrc_Dutch:
map_NRCS_Dutch(code);
break;
case nrc_Finnish:
case nrc_Finnish2:
map_NRCS_Finnish(code);
break;
case nrc_French:
case nrc_French2:
map_NRCS_French(code);
break;
case nrc_French_Canadian:
map_NRCS_French_Canadian(code);
break;
case nrc_German:
map_NRCS_German(code);
break;
case nrc_Greek:
map_NRCS_Greek(code); /* FIXME - ELOT? */
break;
case nrc_DEC_Greek_Supp:
map_DEC_Greek_Supp(code);
break;
case nrc_ISO_Greek_Supp:
map_ISO_Greek_Supp(code);
break;
case nrc_DEC_Hebrew_Supp:
map_DEC_Hebrew_Supp(code);
break;
case nrc_Hebrew:
map_NRCS_Hebrew(code);
break;
case nrc_ISO_Hebrew_Supp:
map_ISO_Hebrew(code);
break;
case nrc_Italian:
map_NRCS_Italian(code);
break;
case nrc_ISO_Latin_2_Supp:
map_ISO_Latin_2(code);
break;
case nrc_ISO_Latin_5_Supp:
map_ISO_Latin_5(code);
break;
case nrc_ISO_Latin_Cyrillic:
map_ISO_Latin_Cyrillic(code);
break;
case nrc_Norwegian_Danish:
case nrc_Norwegian_Danish2:
case nrc_Norwegian_Danish3:
map_NRCS_Norwegian_Danish(code);
break;
case nrc_Portugese:
map_NRCS_Portuguese(code);
break;
case nrc_SCS_NRCS: /* vt5xx - Serbo/Croatian */
/* FIXME */
break;
case nrc_Spanish:
map_NRCS_Spanish(code);
break;
case nrc_Swedish2:
case nrc_Swedish:
map_NRCS_Swedish(code);
break;
case nrc_Swiss:
map_NRCS_Swiss(code);
break;
case nrc_Turkish:
map_NRCS_Turkish(code);
break;
case nrc_DEC_Turkish_Supp:
map_DEC_Turkish_Supp(code);
break;
case nrc_DEC_Cyrillic:
map_DEC_Cyrillic(code);
break;
/*
 * No table is applied for the remaining sets.  Unlike xtermCharSetOut,
 * nrc_French_Canadian2 is in this group rather than sharing the
 * French-Canadian table.
 */
case nrc_ISO_Latin_1_Supp:
case nrc_British_Latin_1:
case nrc_Russian:
case nrc_French_Canadian2:
case nrc_Unknown:
case nrc_DEC_UPSS:
default: /* any character sets we don't recognize */
break;
}
code &= 0x7f; /* NRC in any case is 7-bit */
TRACE(("->\t%s\n",
visibleUChar(code)));
return code;
/* the helper macros are local to this function */
#undef MAP
#undef UNI
#undef XXX
}
/*
 * Translate a string to the display form.  This assumes the font has the
 * DEC graphic characters in cells 0-31, and otherwise is ISO-8859-1.
 *
 * The text in xw->work.write_text is translated in place.
 *
 * Parameters:
 *	xw	- the widget owning the work buffer and screen state.
 *	length	- number of characters in xw->work.write_text (nonzero).
 *	leftset	- character set applied to codes below 128; codes of 128 and
 *		  above use the set currently selected by screen->curgr.
 *
 * Returns the number of characters left in the buffer, which is smaller
 * than "length" if DEL characters were removed.
 *
 * With OPT_DEC_RECTOPS, the untranslated code and the character set of each
 * retained character are also recorded in xw->work.buffer_sums and
 * xw->work.buffer_sets.  If those buffers cannot be enlarged they are
 * released, and recording is skipped for this call.
 */
Cardinal
xtermCharSetOut(XtermWidget xw, Cardinal length, DECNRCM_codes leftset)
{
    IChar *buf = xw->work.write_text;
    IChar *ptr = buf + length;
    IChar *s;
    TScreen *screen = TScreenOf(xw);
    Cardinal count = 0;
    DECNRCM_codes rightset = screen->gsets[(int) (screen->curgr)];
#if OPT_DEC_RECTOPS
    int sums = 0;
#endif

    /*
     * Building blocks for the map_XXX() table macros used in the switch
     * below; they operate on the local variables "chr" and "screen".
     */
#define MAP(from, to) case from: chr = to; break;

#if OPT_WIDE_CHARS
#define UNI(from, to) case from: if (screen->utf8_nrc_mode) chr = to; break;
#define XXX(from, to) UNI(from, to)
#else
#define UNI(old, new) case new: chr = old; break;
#define XXX(from, to) /* nothing */
#endif

    TRACE(("CHARSET-OUT GL=%s(G%d) GR=%s(G%d) SS%d\n\t%s\n",
	   visibleScsCode(leftset), screen->curgl,
	   visibleScsCode(rightset), screen->curgr,
	   screen->curss,
	   visibleIChars(buf, (size_t) length)));

    assert(length != 0);
#if OPT_DEC_RECTOPS
    if (length != 0 && length > xw->work.sizeof_sums) {
	void *new_sums;
	void *new_sets;

	xw->work.sizeof_sums += length + 80;

	/*
	 * Keep the old pointers until realloc is known to have succeeded,
	 * so that a failure does not leak the previous allocation.
	 */
	new_sums = realloc(xw->work.buffer_sums, xw->work.sizeof_sums);
	if (new_sums != NULL)
	    xw->work.buffer_sums = new_sums;
	new_sets = realloc(xw->work.buffer_sets, xw->work.sizeof_sums);
	if (new_sets != NULL)
	    xw->work.buffer_sets = new_sets;

	if (new_sums == NULL || new_sets == NULL) {
	    /*
	     * At least one buffer is too small for this text.  Release
	     * both and reset the size, so that recording is skipped below
	     * and the allocation is retried on the next call.
	     */
	    free(xw->work.buffer_sums);
	    free(xw->work.buffer_sets);
	    xw->work.buffer_sums = NULL;
	    xw->work.buffer_sets = NULL;
	    xw->work.sizeof_sums = 0;
	}
    }
    xw->work.write_sums = xw->work.buffer_sums;
#endif

    /*
     * "s" is advanced explicitly:  when a DEL is removed, the following
     * character is moved into the current position and must be examined
     * in the next iteration rather than skipped.
     */
    for (s = buf; s < ptr;) {
	int eight = CharOf(E2A(*s));
	int seven = eight & 0x7f;
	DECNRCM_codes cs = (eight >= 128) ? rightset : leftset;
	int chr = eight;

	HandleUPSS(cs);

#if OPT_DEC_RECTOPS
	if (xw->work.buffer_sums != NULL && xw->work.buffer_sets != NULL) {
	    xw->work.buffer_sums[sums] = (Char) ((eight < 32 || eight > 255)
						 ? ANSI_ESC
						 : eight);
	    xw->work.buffer_sets[sums] = cs;
	    ++sums;
	}
#endif
	count++;
#if OPT_WIDE_CHARS
	/*
	 * This is only partly right - prevent inadvertent remapping of
	 * the replacement character and other non-8bit codes into bogus
	 * 8bit codes.
	 */
	if (screen->utf8_mode || screen->utf8_nrc_mode) {
	    if (*s > 255) {
		++s;
		continue;
	    }
	}
#endif
	/* codes below 32 are left as they are */
	if (*s < 32) {
	    ++s;
	    continue;
	}

	switch (cs) {
	case nrc_DEC_UPSS:
	    break;

	case nrc_ISO_Latin_1_Supp:
	case nrc_British_Latin_1:
	case nrc_British:	/* United Kingdom set (or Latin 1) */
	    if ((xw->flags & NATIONAL)
		|| (screen->vtXX_level <= 1)) {
		if ((xw->flags & NATIONAL)) {
		    chr = seven;
		}
		if (chr == 0x23) {
		    chr = XTERM_POUND;
#if OPT_WIDE_CHARS
		    if (screen->utf8_nrc_mode) {
			chr = 0xa3;
		    }
#endif
		}
	    }
	    break;

	case nrc_DEC_Alt_Chars:
	case nrc_DEC_Alt_Graphics:
	case nrc_ASCII:
	    break;

	case nrc_DEC_Spec_Graphic:
	    if (seven > 0x5f && seven <= 0x7e) {
#if OPT_WIDE_CHARS
		if (screen->utf8_mode || screen->utf8_nrc_mode)
		    chr = (int) dec2ucs(screen, (unsigned) (seven - 0x5f));
		else
#endif
		    chr = seven - 0x5f;
	    } else if (chr == 0x5f) {
		chr = 0;
	    } else {
		chr = seven;
	    }
	    break;

	case nrc_DEC_Supp:
	case nrc_DEC_Supp_Graphic:
	    map_DEC_Supp_Graphic(chr = seven, chr = eight);
	    break;

	case nrc_DEC_Technical:
	    map_DEC_Technical(chr = seven);
	    break;

	case nrc_Dutch:
	    map_NRCS_Dutch(chr = seven);
	    break;

	case nrc_Finnish:
	case nrc_Finnish2:
	    map_NRCS_Finnish(chr = seven);
	    break;

	case nrc_French:
	case nrc_French2:
	    map_NRCS_French(chr = seven);
	    break;

	case nrc_French_Canadian:
	case nrc_French_Canadian2:
	    map_NRCS_French_Canadian(chr = seven);
	    break;

	case nrc_German:
	    map_NRCS_German(chr = seven);
	    break;

	case nrc_Greek:
	    map_NRCS_Greek(chr = seven);	/* FIXME - ELOT? */
	    break;

	case nrc_DEC_Greek_Supp:
	    map_DEC_Greek_Supp(chr = seven);
	    break;

	case nrc_ISO_Greek_Supp:
	    map_ISO_Greek_Supp(chr = seven);
	    break;

	case nrc_DEC_Hebrew_Supp:
	    map_DEC_Hebrew_Supp(chr = seven);
	    break;

	case nrc_Hebrew:
	    map_NRCS_Hebrew(chr = seven);
	    break;

	case nrc_ISO_Hebrew_Supp:
	    map_ISO_Hebrew(chr = seven);
	    break;

	case nrc_Italian:
	    map_NRCS_Italian(chr = seven);
	    break;

	case nrc_ISO_Latin_2_Supp:
	    map_ISO_Latin_2(chr = seven);
	    break;

	case nrc_ISO_Latin_5_Supp:
	    map_ISO_Latin_5(chr = seven);
	    break;

	case nrc_ISO_Latin_Cyrillic:
	    map_ISO_Latin_Cyrillic(chr = seven);
	    break;

	case nrc_Norwegian_Danish:
	case nrc_Norwegian_Danish2:
	case nrc_Norwegian_Danish3:
	    map_NRCS_Norwegian_Danish(chr = seven);
	    break;

	case nrc_Portugese:
	    map_NRCS_Portuguese(chr = seven);
	    break;

	case nrc_SCS_NRCS:	/* vt5xx - Serbo/Croatian */
	    /* FIXME */
	    break;

	case nrc_Spanish:
	    map_NRCS_Spanish(chr = seven);
	    break;

	case nrc_Swedish2:
	case nrc_Swedish:
	    map_NRCS_Swedish(chr = seven);
	    break;

	case nrc_Swiss:
	    map_NRCS_Swiss(chr = seven);
	    break;

	case nrc_Turkish:
	    map_NRCS_Turkish(chr = seven);
	    break;

	case nrc_DEC_Turkish_Supp:
	    map_DEC_Turkish_Supp(chr = seven);
	    break;

	case nrc_DEC_Cyrillic:
	    map_DEC_Cyrillic(chr = seven);
	    break;

	case nrc_Russian:
	case nrc_Unknown:
	default:		/* any character sets we don't recognize */
	    break;
	}

	/*
	 * The state machine already treated DEL as a nonprinting and
	 * nonspacing character.  If we have DEL now, remove it.
	 */
	if (chr == ANSI_DEL && isSevenBit(cs)) {
	    IChar *s1;

	    /* close the gap by shifting the remaining text down */
	    --ptr;
	    for (s1 = s; s1 < ptr; ++s1) {
		s1[0] = s1[1];
	    }
	    /* undo the bookkeeping done above for the removed character */
	    --count;
#if OPT_DEC_RECTOPS
	    --sums;
#endif
	    /*
	     * Do not advance "s":  it now refers to the character which
	     * followed the DEL, and that one has not been translated or
	     * counted yet.
	     */
	} else {
	    if (eight >= 128 && chr < 128 && chr > 32)
		chr |= 128;
	    *s = (IChar) A2E(chr);
	    ++s;
	}
    }
    TRACE(("%d\t%s\n",
	   count,
	   visibleIChars(buf, (size_t) length)));
    return count;
    /* the helper macros are local to this function */
#undef MAP
#undef UNI
#undef XXX
}
#if OPT_DEC_RECTOPS
/*
 * Given a mapped character, e.g., a Unicode value returned by xtermCharSetIn,
 * match it against the current GL/GR selection and return the corresponding
 * DEC internal character-set code for DECRQCRA.
 *
 * A hardware terminal presumably stores the original and mapped characters,
 * as well as the character set which was selected at that time.  Doing that
 * in xterm adds a couple of bytes to every cell.
 *
 * Parameters:
 *	xw	- the widget; its screen state is read by HandleUPSS() and,
 *		  with OPT_WIDE_CHARS, by the UNI() table entries.
 *	chr	- the character to convert.
 *	cs	- the character set to convert against; nrc_DEC_UPSS is
 *		  resolved by HandleUPSS() before the table lookup.
 *
 * Returns:
 *	ANSI_ESC for characters below 0x20 (and, with OPT_WIDE_CHARS, above
 *	0xff), and for 0xa0 in a seven-bit set; ANSI_SPA for a space in a
 *	seven-bit set; 0 for DEL or 0xff in a seven-bit set; otherwise the
 *	value produced by the set's unmap table, or - if no table supplied
 *	one - the original character with the high bit cleared (seven-bit
 *	sets) or set (all other sets).
 */
int
xtermCharSetDec(XtermWidget xw, IChar chr, DECNRCM_codes cs)
{
/*
 * Building blocks for the unmap_XXX() table macros used below; they operate
 * on the local variables "result", "actual" and "screen".  DFT_94() and
 * DFT_96() ignore their argument and always use "actual", i.e., the
 * character as it was before masking to seven bits.
 *
 * NOTE(review): DFTMAP, DFT_94 and DFT_96 are not #undef'ed at the end of
 * this function, unlike MAP, UNI and XXX.
 */
#define MAP(from, to) case from: result = to; break;
#define DFTMAP() result = (actual | 0x80)
#define DFT_94(chr) result = ((actual) & 0x7f)
#define DFT_96(chr) result = ((actual) | 0x80)
#if OPT_WIDE_CHARS
#define UNI(from, to) case from: if (screen->utf8_nrc_mode) result = to; break;
#define XXX(from, to) UNI(from, to)
#else
#define UNI(old, new) case new: result = old; break;
#define XXX(from, to) /* nothing */
#endif
int result;
/* characters outside 0x20..0xff are all reported as ANSI_ESC */
if (chr < 0x20
#if OPT_WIDE_CHARS
|| chr > 0xff
#endif
) {
result = ANSI_ESC;
} else {
/*
 * NOTE(review): isSeven is computed before HandleUPSS() replaces
 * nrc_DEC_UPSS, and isSevenBit(nrc_DEC_UPSS) is False, so for UPSS the
 * flag does not reflect the resolved set.  xtermCharSetOut calls
 * HandleUPSS() before isSevenBit() - confirm whether the order here is
 * intended.
 */
Boolean isSeven = isSevenBit(cs);
TScreen *screen = TScreenOf(xw);
/* -1 marks "no table entry found yet" */
result = -1;
HandleUPSS(cs);
/* special cases for seven-bit sets, handled before any table lookup */
if (chr == 0xa0 && isSeven) {
result = ANSI_ESC;
} else if (chr == ANSI_SPA && isSeven) {
result = ANSI_SPA;
} else if ((chr == ANSI_DEL || chr == 0xff) && isSeven) {
result = 0;
} else {
/* keep the unmasked value for the DFTMAP/DFT_94/DFT_96 defaults */
int actual = (int) chr;
chr &= 0x7f;
switch (cs) {
case nrc_DEC_Alt_Chars:
case nrc_DEC_Alt_Graphics:
case nrc_ASCII:
result = (int) chr;
break;
case nrc_British:
/*
 * NOTE(review): chr was masked with 0x7f just above, so this test
 * against 0xa0 can never succeed and the body is unreachable; the
 * result for this set always comes from the fallback after the
 * switch.  The intended condition should be confirmed.
 */
if (chr >= 0xa0 && chr < 0xff) {
if (chr == 0x23)
chr = 0xA3;
result = (int) chr;
}
break;
case nrc_DEC_Cyrillic:
unmap_DEC_Cyrillic(chr, DFT_94(chr));
break;
case nrc_DEC_Spec_Graphic:
unmap_DEC_Spec_Graphic(chr, DFT_94(chr));
break;
case nrc_DEC_Supp:
/* FALLTHRU */
case nrc_DEC_Supp_Graphic:
unmap_DEC_Supp_Graphic(chr, DFTMAP());
break;
case nrc_DEC_Technical:
unmap_DEC_Technical(chr, DFTMAP());
break;
case nrc_Dutch:
unmap_NRCS_Dutch(chr, DFT_94(chr));
break;
case nrc_Finnish:
case nrc_Finnish2:
unmap_NRCS_Finnish(chr, DFT_94(chr));
break;
case nrc_French:
case nrc_French2:
unmap_NRCS_French(chr, DFT_94(chr));
break;
case nrc_French_Canadian:
case nrc_French_Canadian2:
unmap_NRCS_French_Canadian(chr, DFT_94(chr));
break;
case nrc_German:
unmap_NRCS_German(chr, DFT_94(chr));
break;
case nrc_Greek:
unmap_NRCS_Greek(chr, DFT_94(chr));
break;
case nrc_DEC_Greek_Supp:
unmap_DEC_Greek_Supp(chr, DFTMAP());
break;
case nrc_ISO_Greek_Supp:
unmap_ISO_Greek_Supp(chr, DFTMAP());
break;
case nrc_DEC_Hebrew_Supp:
unmap_DEC_Hebrew_Supp(chr, DFTMAP());
break;
case nrc_Hebrew:
unmap_NRCS_Hebrew(chr, DFT_94(chr));
break;
case nrc_ISO_Hebrew_Supp:
unmap_ISO_Hebrew(chr, DFTMAP());
break;
case nrc_Italian:
unmap_NRCS_Italian(chr, DFT_94(chr));
break;
case nrc_ISO_Latin_1_Supp:
unmap_ISO_Latin_1(chr, DFTMAP());
break;
case nrc_ISO_Latin_2_Supp:
unmap_ISO_Latin_2(chr, DFTMAP());
break;
case nrc_ISO_Latin_5_Supp:
unmap_ISO_Latin_5(chr, DFTMAP());
break;
case nrc_ISO_Latin_Cyrillic:
unmap_ISO_Latin_Cyrillic(chr, DFTMAP());
break;
case nrc_Norwegian_Danish:
case nrc_Norwegian_Danish2:
case nrc_Norwegian_Danish3:
unmap_NRCS_Norwegian_Danish(chr, DFT_94(chr));
break;
case nrc_Portugese:
unmap_NRCS_Portuguese(chr, DFT_94(chr));
break;
case nrc_Spanish:
unmap_NRCS_Spanish(chr, DFT_94(chr));
break;
case nrc_Swedish:
case nrc_Swedish2:
unmap_NRCS_Swedish(chr, DFT_94(chr));
break;
case nrc_Swiss:
unmap_NRCS_Swiss(chr, DFT_94(chr));
break;
case nrc_DEC_Turkish_Supp:
unmap_DEC_Turkish_Supp(chr, DFTMAP());
break;
case nrc_Turkish:
unmap_NRCS_Turkish(chr, DFT_94(chr));
break;
case nrc_British_Latin_1:
case nrc_SCS_NRCS:
case nrc_Russian:
case nrc_Unknown:
case nrc_DEC_UPSS:
default: /* anything we cannot unmap */
break;
}
/*
 * Nothing above produced a value:  fall back to the original character
 * with the high bit cleared for seven-bit sets, or set otherwise.
 */
if (result < 0) {
if (isSeven) {
DFT_94(chr);
} else {
DFT_96(chr);
}
}
}
}
return result;
#undef MAP
#undef UNI
#undef XXX
}
#endif /* OPT_DEC_RECTOPS */