sync code with last improvements from OpenBSD
This commit is contained in:
parent
f0c5a45f3a
commit
6dffc8ab2a
28 changed files with 2476 additions and 1648 deletions
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: lapic.c,v 1.70 2023/09/14 19:39:47 cheloha Exp $ */
|
||||
/* $OpenBSD: lapic.c,v 1.71 2023/09/17 14:50:50 cheloha Exp $ */
|
||||
/* $NetBSD: lapic.c,v 1.2 2003/05/08 01:04:35 fvdl Exp $ */
|
||||
|
||||
/*-
|
||||
|
@ -499,7 +499,6 @@ lapic_initclocks(void)
|
|||
stathz = hz;
|
||||
profhz = stathz * 10;
|
||||
statclock_is_randomized = 1;
|
||||
clockintr_init(0);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: clock.c,v 1.41 2023/08/23 01:55:46 cheloha Exp $ */
|
||||
/* $OpenBSD: clock.c,v 1.42 2023/09/17 14:50:50 cheloha Exp $ */
|
||||
/* $NetBSD: clock.c,v 1.1 2003/04/26 18:39:50 fvdl Exp $ */
|
||||
|
||||
/*-
|
||||
|
@ -283,7 +283,6 @@ i8254_initclocks(void)
|
|||
|
||||
stathz = 128;
|
||||
profhz = 1024; /* XXX does not divide into 1 billion */
|
||||
clockintr_init(0);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: agtimer.c,v 1.20 2023/09/14 19:39:47 cheloha Exp $ */
|
||||
/* $OpenBSD: agtimer.c,v 1.21 2023/09/17 14:50:51 cheloha Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2011 Dale Rahn <drahn@openbsd.org>
|
||||
* Copyright (c) 2013 Patrick Wildt <patrick@blueri.se>
|
||||
|
@ -231,7 +231,6 @@ agtimer_cpu_initclocks(void)
|
|||
stathz = hz;
|
||||
profhz = stathz * 10;
|
||||
statclock_is_randomized = 1;
|
||||
clockintr_init(0);
|
||||
|
||||
if (sc->sc_ticks_per_second != agtimer_frequency) {
|
||||
agtimer_set_clockrate(agtimer_frequency);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: amptimer.c,v 1.19 2023/09/14 19:39:47 cheloha Exp $ */
|
||||
/* $OpenBSD: amptimer.c,v 1.20 2023/09/17 14:50:51 cheloha Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2011 Dale Rahn <drahn@openbsd.org>
|
||||
*
|
||||
|
@ -288,7 +288,6 @@ amptimer_cpu_initclocks(void)
|
|||
stathz = hz;
|
||||
profhz = hz * 10;
|
||||
statclock_is_randomized = 1;
|
||||
clockintr_init(0);
|
||||
|
||||
if (sc->sc_ticks_per_second != amptimer_frequency) {
|
||||
amptimer_set_clockrate(amptimer_frequency);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: acpipci.c,v 1.40 2023/09/12 08:32:58 jmatthew Exp $ */
|
||||
/* $OpenBSD: acpipci.c,v 1.41 2023/09/16 23:25:16 jmatthew Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2018 Mark Kettenis
|
||||
*
|
||||
|
@ -844,7 +844,8 @@ acpipci_iort_map(struct acpi_iort *iort, uint32_t offset, uint32_t id,
|
|||
itsn = (struct acpi_iort_its_node *)&node[1];
|
||||
LIST_FOREACH(icl, &interrupt_controllers, ic_list) {
|
||||
for (i = 0; i < itsn->number_of_itss; i++) {
|
||||
if (icl->ic_gic_its_id == itsn->its_ids[i]) {
|
||||
if (icl->ic_establish_msi != NULL &&
|
||||
icl->ic_gic_its_id == itsn->its_ids[i]) {
|
||||
*ic = icl;
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: agtimer.c,v 1.27 2023/09/14 19:39:47 cheloha Exp $ */
|
||||
/* $OpenBSD: agtimer.c,v 1.28 2023/09/17 14:50:51 cheloha Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2011 Dale Rahn <drahn@openbsd.org>
|
||||
* Copyright (c) 2013 Patrick Wildt <patrick@blueri.se>
|
||||
|
@ -294,7 +294,6 @@ agtimer_cpu_initclocks(void)
|
|||
stathz = hz;
|
||||
profhz = stathz * 10;
|
||||
statclock_is_randomized = 1;
|
||||
clockintr_init(0);
|
||||
|
||||
if (sc->sc_ticks_per_second != agtimer_frequency) {
|
||||
agtimer_set_clockrate(agtimer_frequency);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: dmtimer.c,v 1.21 2023/09/14 19:39:47 cheloha Exp $ */
|
||||
/* $OpenBSD: dmtimer.c,v 1.22 2023/09/17 14:50:51 cheloha Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2007,2009 Dale Rahn <drahn@openbsd.org>
|
||||
* Copyright (c) 2013 Raphael Graf <r@undefined.ch>
|
||||
|
@ -233,7 +233,6 @@ dmtimer_cpu_initclocks(void)
|
|||
stathz = hz;
|
||||
profhz = stathz * 10;
|
||||
statclock_is_randomized = 1;
|
||||
clockintr_init(0);
|
||||
|
||||
sc->sc_ticks_per_second = TIMER_FREQUENCY; /* 32768 */
|
||||
sc->sc_nsec_cycle_ratio =
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: gptimer.c,v 1.22 2023/09/14 19:39:47 cheloha Exp $ */
|
||||
/* $OpenBSD: gptimer.c,v 1.23 2023/09/17 14:50:51 cheloha Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2007,2009 Dale Rahn <drahn@openbsd.org>
|
||||
*
|
||||
|
@ -199,7 +199,6 @@ gptimer_cpu_initclocks(void)
|
|||
stathz = hz;
|
||||
profhz = stathz * 10;
|
||||
statclock_is_randomized = 1;
|
||||
clockintr_init(0);
|
||||
|
||||
gptimer_nsec_cycle_ratio = TIMER_FREQUENCY * (1ULL << 32) / 1000000000;
|
||||
gptimer_nsec_max = UINT64_MAX / gptimer_nsec_cycle_ratio;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: sxitimer.c,v 1.23 2023/09/14 19:39:47 cheloha Exp $ */
|
||||
/* $OpenBSD: sxitimer.c,v 1.24 2023/09/17 14:50:51 cheloha Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2007,2009 Dale Rahn <drahn@openbsd.org>
|
||||
* Copyright (c) 2013 Raphael Graf <r@undefined.ch>
|
||||
|
@ -181,7 +181,6 @@ sxitimer_attach(struct device *parent, struct device *self, void *aux)
|
|||
stathz = hz;
|
||||
profhz = stathz * 10;
|
||||
statclock_is_randomized = 1;
|
||||
clockintr_init(0);
|
||||
|
||||
/* stop timer, and set clk src */
|
||||
bus_space_write_4(sxitimer_iot, sxitimer_ioh,
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: lapic.c,v 1.57 2023/09/14 19:39:48 cheloha Exp $ */
|
||||
/* $OpenBSD: lapic.c,v 1.58 2023/09/17 14:50:51 cheloha Exp $ */
|
||||
/* $NetBSD: lapic.c,v 1.1.2.8 2000/02/23 06:10:50 sommerfeld Exp $ */
|
||||
|
||||
/*-
|
||||
|
@ -327,7 +327,6 @@ lapic_initclocks(void)
|
|||
stathz = hz;
|
||||
profhz = stathz * 10;
|
||||
statclock_is_randomized = 1;
|
||||
clockintr_init(0);
|
||||
}
|
||||
|
||||
extern int gettick(void); /* XXX put in header file */
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: clock.c,v 1.67 2023/08/23 01:55:46 cheloha Exp $ */
|
||||
/* $OpenBSD: clock.c,v 1.68 2023/09/17 14:50:51 cheloha Exp $ */
|
||||
/* $NetBSD: clock.c,v 1.39 1996/05/12 23:11:54 mycroft Exp $ */
|
||||
|
||||
/*-
|
||||
|
@ -426,7 +426,6 @@ i8254_initclocks(void)
|
|||
|
||||
stathz = 128;
|
||||
profhz = 1024; /* XXX does not divide into 1 billion */
|
||||
clockintr_init(0);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: kern_clockintr.c,v 1.53 2023/09/15 11:48:49 deraadt Exp $ */
|
||||
/* $OpenBSD: kern_clockintr.c,v 1.56 2023/09/17 15:24:35 cheloha Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
|
||||
* Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
|
||||
|
@ -31,13 +31,6 @@
|
|||
#include <sys/sysctl.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
/*
|
||||
* Protection for global variables in this file:
|
||||
*
|
||||
* I Immutable after initialization.
|
||||
*/
|
||||
uint32_t clockintr_flags; /* [I] global state + behavior flags */
|
||||
|
||||
void clockintr_hardclock(struct clockintr *, void *, void *);
|
||||
void clockintr_schedule(struct clockintr *, uint64_t);
|
||||
void clockintr_schedule_locked(struct clockintr *, uint64_t);
|
||||
|
@ -50,19 +43,6 @@ void clockqueue_pend_insert(struct clockintr_queue *, struct clockintr *,
|
|||
void clockqueue_reset_intrclock(struct clockintr_queue *);
|
||||
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);
|
||||
|
||||
/*
|
||||
* Initialize global state. Set flags and compute intervals.
|
||||
*/
|
||||
void
|
||||
clockintr_init(uint32_t flags)
|
||||
{
|
||||
KASSERT(CPU_IS_PRIMARY(curcpu()));
|
||||
KASSERT(clockintr_flags == 0);
|
||||
KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
|
||||
|
||||
SET(clockintr_flags, flags | CL_INIT);
|
||||
}
|
||||
|
||||
/*
|
||||
* Ready the calling CPU for clockintr_dispatch(). If this is our
|
||||
* first time here, install the intrclock, if any, and set necessary
|
||||
|
@ -77,8 +57,6 @@ clockintr_cpu_init(const struct intrclock *ic)
|
|||
struct schedstate_percpu *spc = &ci->ci_schedstate;
|
||||
int reset_cq_intrclock = 0;
|
||||
|
||||
KASSERT(ISSET(clockintr_flags, CL_INIT));
|
||||
|
||||
if (ic != NULL)
|
||||
clockqueue_intrclock_install(cq, ic);
|
||||
|
||||
|
@ -355,10 +333,9 @@ clockintr_cancel(struct clockintr *cl)
|
|||
}
|
||||
|
||||
struct clockintr *
|
||||
clockintr_establish(void *vci,
|
||||
clockintr_establish(struct cpu_info *ci,
|
||||
void (*func)(struct clockintr *, void *, void *), void *arg)
|
||||
{
|
||||
struct cpu_info *ci = vci;
|
||||
struct clockintr *cl;
|
||||
struct clockintr_queue *cq = &ci->ci_queue;
|
||||
|
||||
|
@ -370,7 +347,7 @@ clockintr_establish(void *vci,
|
|||
cl->cl_queue = cq;
|
||||
|
||||
mtx_enter(&cq->cq_mtx);
|
||||
TAILQ_INSERT_TAIL(&cq->cq_est, cl, cl_elink);
|
||||
TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
|
||||
mtx_leave(&cq->cq_mtx);
|
||||
return cl;
|
||||
}
|
||||
|
@ -443,7 +420,7 @@ clockqueue_init(struct clockintr_queue *cq)
|
|||
|
||||
cq->cq_shadow.cl_queue = cq;
|
||||
mtx_init(&cq->cq_mtx, IPL_CLOCK);
|
||||
TAILQ_INIT(&cq->cq_est);
|
||||
TAILQ_INIT(&cq->cq_all);
|
||||
TAILQ_INIT(&cq->cq_pend);
|
||||
cq->cq_gen = 1;
|
||||
SET(cq->cq_flags, CQ_INIT);
|
||||
|
@ -623,7 +600,7 @@ db_show_clockintr_cpu(struct cpu_info *ci)
|
|||
db_show_clockintr(cq->cq_running, "run", cpu);
|
||||
TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
|
||||
db_show_clockintr(elm, "pend", cpu);
|
||||
TAILQ_FOREACH(elm, &cq->cq_est, cl_elink) {
|
||||
TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
|
||||
if (!ISSET(elm->cl_flags, CLST_PENDING))
|
||||
db_show_clockintr(elm, "idle", cpu);
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: sched_bsd.c,v 1.86 2023/09/10 03:08:05 cheloha Exp $ */
|
||||
/* $OpenBSD: sched_bsd.c,v 1.87 2023/09/17 13:02:24 cheloha Exp $ */
|
||||
/* $NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $ */
|
||||
|
||||
/*-
|
||||
|
@ -117,9 +117,9 @@ roundrobin(struct clockintr *cl, void *cf, void *arg)
|
|||
* 1, 5, and 15 minute intervals.
|
||||
*/
|
||||
void
|
||||
update_loadavg(void *arg)
|
||||
update_loadavg(void *unused)
|
||||
{
|
||||
struct timeout *to = (struct timeout *)arg;
|
||||
static struct timeout to = TIMEOUT_INITIALIZER(update_loadavg, NULL);
|
||||
CPU_INFO_ITERATOR cii;
|
||||
struct cpu_info *ci;
|
||||
u_int i, nrun = 0;
|
||||
|
@ -135,7 +135,7 @@ update_loadavg(void *arg)
|
|||
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
|
||||
}
|
||||
|
||||
timeout_add_sec(to, 5);
|
||||
timeout_add_sec(&to, 5);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -227,9 +227,9 @@ fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
|
|||
* Recompute process priorities, every second.
|
||||
*/
|
||||
void
|
||||
schedcpu(void *arg)
|
||||
schedcpu(void *unused)
|
||||
{
|
||||
struct timeout *to = (struct timeout *)arg;
|
||||
static struct timeout to = TIMEOUT_INITIALIZER(schedcpu, NULL);
|
||||
fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
|
||||
struct proc *p;
|
||||
int s;
|
||||
|
@ -280,7 +280,7 @@ schedcpu(void *arg)
|
|||
SCHED_UNLOCK(s);
|
||||
}
|
||||
wakeup(&lbolt);
|
||||
timeout_add_sec(to, 1);
|
||||
timeout_add_sec(&to, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -726,23 +726,14 @@ sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
|
|||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Start the scheduler's periodic timeouts.
|
||||
*/
|
||||
void
|
||||
scheduler_start(void)
|
||||
{
|
||||
static struct timeout schedcpu_to;
|
||||
static struct timeout loadavg_to;
|
||||
|
||||
/*
|
||||
* We avoid polluting the global namespace by keeping the scheduler
|
||||
* timeouts static in this function.
|
||||
* We setup the timeout here and kick schedcpu once to make it do
|
||||
* its job.
|
||||
*/
|
||||
timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
|
||||
timeout_set(&loadavg_to, update_loadavg, &loadavg_to);
|
||||
|
||||
schedcpu(&schedcpu_to);
|
||||
update_loadavg(&loadavg_to);
|
||||
schedcpu(NULL);
|
||||
update_loadavg(NULL);
|
||||
|
||||
#ifndef SMALL_KERNEL
|
||||
if (perfpolicy == PERFPOL_AUTO)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: clockintr.h,v 1.17 2023/09/15 11:48:48 deraadt Exp $ */
|
||||
/* $OpenBSD: clockintr.h,v 1.20 2023/09/17 15:24:35 cheloha Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
|
||||
*
|
||||
|
@ -35,6 +35,8 @@ struct clockintr_stat {
|
|||
#include <sys/mutex.h>
|
||||
#include <sys/queue.h>
|
||||
|
||||
struct cpu_info;
|
||||
|
||||
/*
|
||||
* Platform API
|
||||
*/
|
||||
|
@ -68,7 +70,7 @@ intrclock_trigger(struct intrclock *ic)
|
|||
struct clockintr_queue;
|
||||
struct clockintr {
|
||||
uint64_t cl_expiration; /* [m] dispatch time */
|
||||
TAILQ_ENTRY(clockintr) cl_elink; /* [m] cq_est glue */
|
||||
TAILQ_ENTRY(clockintr) cl_alink; /* [m] cq_all glue */
|
||||
TAILQ_ENTRY(clockintr) cl_plink; /* [m] cq_pend glue */
|
||||
void *cl_arg; /* [I] argument */
|
||||
void (*cl_func)(struct clockintr *, void *, void *); /* [I] callback */
|
||||
|
@ -94,7 +96,7 @@ struct clockintr_queue {
|
|||
struct clockintr cq_shadow; /* [o] copy of running clockintr */
|
||||
struct mutex cq_mtx; /* [a] per-queue mutex */
|
||||
uint64_t cq_uptime; /* [o] cached uptime */
|
||||
TAILQ_HEAD(, clockintr) cq_est; /* [m] established clockintr list */
|
||||
TAILQ_HEAD(, clockintr) cq_all; /* [m] established clockintr list */
|
||||
TAILQ_HEAD(, clockintr) cq_pend;/* [m] pending clockintr list */
|
||||
struct clockintr *cq_running; /* [m] running clockintr */
|
||||
struct clockintr *cq_hardclock; /* [o] hardclock handle */
|
||||
|
@ -109,16 +111,8 @@ struct clockintr_queue {
|
|||
#define CQ_INTRCLOCK 0x00000002 /* intrclock installed */
|
||||
#define CQ_STATE_MASK 0x00000003
|
||||
|
||||
/* Global state flags. */
|
||||
#define CL_INIT 0x00000001 /* global init done */
|
||||
#define CL_STATE_MASK 0x00000001
|
||||
|
||||
/* Global behavior flags. */
|
||||
#define CL_FLAG_MASK 0x00000000
|
||||
|
||||
void clockintr_cpu_init(const struct intrclock *);
|
||||
int clockintr_dispatch(void *);
|
||||
void clockintr_init(uint32_t);
|
||||
void clockintr_trigger(void);
|
||||
|
||||
/*
|
||||
|
@ -128,7 +122,7 @@ void clockintr_trigger(void);
|
|||
uint64_t clockintr_advance(struct clockintr *, uint64_t);
|
||||
uint64_t clockintr_advance_random(struct clockintr *, uint64_t, uint32_t);
|
||||
void clockintr_cancel(struct clockintr *);
|
||||
struct clockintr *clockintr_establish(void *,
|
||||
struct clockintr *clockintr_establish(struct cpu_info *,
|
||||
void (*)(struct clockintr *, void *, void *), void *);
|
||||
void clockintr_stagger(struct clockintr *, uint64_t, uint32_t, uint32_t);
|
||||
void clockqueue_init(struct clockintr_queue *);
|
||||
|
|
1417
usr.bin/awk/FIXES
1417
usr.bin/awk/FIXES
File diff suppressed because it is too large
Load diff
1429
usr.bin/awk/FIXES.1e
Normal file
1429
usr.bin/awk/FIXES.1e
Normal file
File diff suppressed because it is too large
Load diff
23
usr.bin/awk/LICENSE
Normal file
23
usr.bin/awk/LICENSE
Normal file
|
@ -0,0 +1,23 @@
|
|||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
||||
Permission to use, copy, modify, and distribute this software and
|
||||
its documentation for any purpose and without fee is hereby
|
||||
granted, provided that the above copyright notice appear in all
|
||||
copies and that both that the copyright notice and this
|
||||
permission notice and warranty disclaimer appear in supporting
|
||||
documentation, and that the name Lucent Technologies or any of
|
||||
its entities not be used in advertising or publicity pertaining
|
||||
to distribution of the software without specific, written prior
|
||||
permission.
|
||||
|
||||
LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
|
||||
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
|
||||
IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
|
||||
SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
|
||||
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
|
||||
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
|
||||
THIS SOFTWARE.
|
||||
****************************************************************/
|
|
@ -1,10 +1,40 @@
|
|||
$OpenBSD: README.md,v 1.6 2022/01/27 16:58:37 millert Exp $
|
||||
$OpenBSD: README.md,v 1.7 2023/09/17 14:49:44 millert Exp $
|
||||
|
||||
# The One True Awk
|
||||
|
||||
This is the version of `awk` described in _The AWK Programming Language_,
|
||||
by Al Aho, Brian Kernighan, and Peter Weinberger
|
||||
(Addison-Wesley, 1988, ISBN 0-201-07981-X).
|
||||
Second Edition, by Al Aho, Brian Kernighan, and Peter Weinberger
|
||||
(Addison-Wesley, 2024, ISBN-13 978-0138269722, ISBN-10 0138269726).
|
||||
|
||||
## What's New? ##
|
||||
|
||||
This version of Awk handles UTF-8 and comma-separated values (CSV) input.
|
||||
|
||||
### Strings ###
|
||||
|
||||
Functions that process strings now count Unicode code points, not bytes;
|
||||
this affects `length`, `substr`, `index`, `match`, `split`,
|
||||
`sub`, `gsub`, and others. Note that code
|
||||
points are not necessarily characters.
|
||||
|
||||
UTF-8 sequences may appear in literal strings and regular expressions.
|
||||
Arbitrary characters may be included with `\u` followed by 1 to 8 hexadecimal digits.
|
||||
|
||||
### Regular expressions ###
|
||||
|
||||
Regular expressions may include UTF-8 code points, including `\u`.
|
||||
Character classes are likely to be limited to about 256 characters
|
||||
when expanded.
|
||||
|
||||
### CSV ###
|
||||
|
||||
The option `--csv` turns on CSV processing of input:
|
||||
fields are separated by commas, fields may be quoted with
|
||||
double-quote (`"`) characters, fields may contain embedded newlines.
|
||||
In CSV mode, `FS` is ignored.
|
||||
|
||||
If no explicit separator argument is provided,
|
||||
field-splitting in `split` is determined by CSV mode.
|
||||
|
||||
## Copyright
|
||||
|
||||
|
@ -69,22 +99,22 @@ The program itself is created by
|
|||
|
||||
which should produce a sequence of messages roughly like this:
|
||||
|
||||
yacc -d awkgram.y
|
||||
conflicts: 43 shift/reduce, 85 reduce/reduce
|
||||
mv y.tab.c ytab.c
|
||||
mv y.tab.h ytab.h
|
||||
cc -c ytab.c
|
||||
cc -c b.c
|
||||
cc -c main.c
|
||||
cc -c parse.c
|
||||
cc maketab.c -o maketab
|
||||
./maketab >proctab.c
|
||||
cc -c proctab.c
|
||||
cc -c tran.c
|
||||
cc -c lib.c
|
||||
cc -c run.c
|
||||
cc -c lex.c
|
||||
cc ytab.o b.o main.o parse.o proctab.o tran.o lib.o run.o lex.o -lm
|
||||
bison -d awkgram.y
|
||||
awkgram.y: warning: 44 shift/reduce conflicts [-Wconflicts-sr]
|
||||
awkgram.y: warning: 85 reduce/reduce conflicts [-Wconflicts-rr]
|
||||
awkgram.y: note: rerun with option '-Wcounterexamples' to generate conflict counterexamples
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o awkgram.tab.o awkgram.tab.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o b.o b.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o main.o main.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o parse.o parse.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 maketab.c -o maketab
|
||||
./maketab awkgram.tab.h >proctab.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o proctab.o proctab.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o tran.o tran.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o lib.o lib.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o run.o run.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o lex.o lex.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 awkgram.tab.o b.o main.o parse.o proctab.o tran.o lib.o run.o lex.o -lm
|
||||
|
||||
This produces an executable `a.out`; you will eventually want to
|
||||
move this to some place like `/usr/bin/awk`.
|
||||
|
@ -104,11 +134,6 @@ the standard developer tools.
|
|||
You can also use `make CC=g++` to build with the GNU C++ compiler,
|
||||
should you choose to do so.
|
||||
|
||||
The version of `malloc` that comes with some systems is sometimes
|
||||
astonishingly slow. If `awk` seems slow, you might try fixing that.
|
||||
More generally, turning on optimization can significantly improve
|
||||
`awk`'s speed, perhaps by 1/3 for highest levels.
|
||||
|
||||
## A Note About Releases
|
||||
|
||||
We don't usually do releases.
|
||||
|
@ -122,5 +147,4 @@ is not at the top of our priority list.
|
|||
|
||||
#### Last Updated
|
||||
|
||||
Sun 23 Jan 2022 03:48:01 PM EST
|
||||
|
||||
Sun Sep 3 09:26:43 EDT 2023
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.\" $OpenBSD: awk.1,v 1.64 2023/09/15 15:07:08 jsg Exp $
|
||||
.\" $OpenBSD: awk.1,v 1.65 2023/09/17 14:49:44 millert Exp $
|
||||
.\"
|
||||
.\" Copyright (C) Lucent Technologies 1997
|
||||
.\" All Rights Reserved
|
||||
|
@ -22,7 +22,7 @@
|
|||
.\" ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
|
||||
.\" THIS SOFTWARE.
|
||||
.\"
|
||||
.Dd $Mdocdate: September 15 2023 $
|
||||
.Dd $Mdocdate: September 17 2023 $
|
||||
.Dt AWK 1
|
||||
.Os
|
||||
.Sh NAME
|
||||
|
@ -33,7 +33,7 @@
|
|||
.Op Fl safe
|
||||
.Op Fl V
|
||||
.Op Fl d Ns Op Ar n
|
||||
.Op Fl F Ar fs
|
||||
.Op Fl F Ar fs | Fl -csv
|
||||
.Op Fl v Ar var Ns = Ns Ar value
|
||||
.Op Ar prog | Fl f Ar progfile
|
||||
.Ar
|
||||
|
@ -64,6 +64,14 @@ and is executed at the time it would have been opened if it were a filename.
|
|||
.Pp
|
||||
The options are as follows:
|
||||
.Bl -tag -width "-safe "
|
||||
.It Fl -csv
|
||||
Process records using the (more or less) standard comma-separated values
|
||||
.Pq CSV
|
||||
format instead of the input field separator.
|
||||
When the
|
||||
.Fl -csv
|
||||
option is specified, attempts to change the input field separator
|
||||
or record separator are ignored.
|
||||
.It Fl d Ns Op Ar n
|
||||
Debug mode.
|
||||
Set debug level to
|
||||
|
@ -1058,4 +1066,5 @@ to it.
|
|||
The scope rules for variables in functions are a botch;
|
||||
the syntax is worse.
|
||||
.Pp
|
||||
Only eight-bit character sets are handled correctly.
|
||||
Input is expected to be UTF-8 encoded.
|
||||
Other multibyte character sets are not handled.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: awk.h,v 1.28 2022/09/01 15:21:28 millert Exp $ */
|
||||
/* $OpenBSD: awk.h,v 1.29 2023/09/17 14:49:44 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
@ -80,6 +80,8 @@ extern char **SUBSEP;
|
|||
extern Awkfloat *RSTART;
|
||||
extern Awkfloat *RLENGTH;
|
||||
|
||||
extern bool CSV; /* true for csv input */
|
||||
|
||||
extern char *record; /* points to $0 */
|
||||
extern int lineno; /* line number in awk program */
|
||||
extern int errorflag; /* 1 if error has occurred */
|
||||
|
@ -236,7 +238,8 @@ extern int pairstack[], paircnt;
|
|||
|
||||
/* structures used by regular expression matching machinery, mostly b.c: */
|
||||
|
||||
#define NCHARS (256+3) /* 256 handles 8-bit chars; 128 does 7-bit */
|
||||
#define NCHARS (1256+3) /* 256 handles 8-bit chars; 128 does 7-bit */
|
||||
/* BUG: some overflows (caught) if we use 256 */
|
||||
/* watch out in match(), etc. */
|
||||
#define HAT (NCHARS+2) /* matches ^ in regular expr */
|
||||
#define NSTATES 32
|
||||
|
@ -247,12 +250,19 @@ typedef struct rrow {
|
|||
int i;
|
||||
Node *np;
|
||||
uschar *up;
|
||||
int *rp; /* rune representation of char class */
|
||||
} lval; /* because Al stores a pointer in it! */
|
||||
int *lfollow;
|
||||
} rrow;
|
||||
|
||||
typedef struct gtt { /* gototab entry */
|
||||
unsigned int ch;
|
||||
unsigned int state;
|
||||
} gtt;
|
||||
|
||||
typedef struct fa {
|
||||
unsigned int **gototab;
|
||||
gtt **gototab;
|
||||
int gototab_len;
|
||||
uschar *out;
|
||||
uschar *restr;
|
||||
int **posns;
|
||||
|
|
267
usr.bin/awk/b.c
267
usr.bin/awk/b.c
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: b.c,v 1.37 2021/07/08 21:26:39 millert Exp $ */
|
||||
/* $OpenBSD: b.c,v 1.38 2023/09/17 14:49:44 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
@ -81,6 +81,41 @@ int patlen;
|
|||
fa *fatab[NFA];
|
||||
int nfatab = 0; /* entries in fatab */
|
||||
|
||||
|
||||
/* utf-8 mechanism:
|
||||
|
||||
For most of Awk, utf-8 strings just "work", since they look like
|
||||
null-terminated sequences of 8-bit bytes.
|
||||
|
||||
Functions like length(), index(), and substr() have to operate
|
||||
in units of utf-8 characters. The u8_* functions in run.c
|
||||
handle this.
|
||||
|
||||
Regular expressions are more complicated, since the basic
|
||||
mechanism of the goto table used 8-bit byte indices into the
|
||||
gototab entries to compute the next state. Unicode is a lot
|
||||
bigger, so the gototab entries are now structs with a character
|
||||
and a next state, and there is a linear search of the characters
|
||||
to find the state. (Yes, this is slower, by a significant
|
||||
amount. Tough.)
|
||||
|
||||
Throughout the RE mechanism in b.c, utf-8 characters are
|
||||
converted to their utf-32 value. This mostly shows up in
|
||||
cclenter, which expands character class ranges like a-z and now
|
||||
alpha-omega. The size of a gototab array is still about 256.
|
||||
This should be dynamic, but for now things work ok for a single
|
||||
code page of Unicode, which is the most likely case.
|
||||
|
||||
The code changes are localized in run.c and b.c. I have added a
|
||||
handful of functions to somewhat better hide the implementation,
|
||||
but a lot more could be done.
|
||||
|
||||
*/
|
||||
|
||||
static int get_gototab(fa*, int, int);
|
||||
static int set_gototab(fa*, int, int, int);
|
||||
extern int u8_rune(int *, const uschar *);
|
||||
|
||||
static int *
|
||||
intalloc(size_t n, const char *f)
|
||||
{
|
||||
|
@ -113,7 +148,7 @@ resizesetvec(const char *f)
|
|||
static void
|
||||
resize_state(fa *f, int state)
|
||||
{
|
||||
unsigned int **p;
|
||||
gtt **p;
|
||||
uschar *p2;
|
||||
int **p3;
|
||||
int i, new_count;
|
||||
|
@ -123,7 +158,7 @@ resize_state(fa *f, int state)
|
|||
|
||||
new_count = state + 10; /* needs to be tuned */
|
||||
|
||||
p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
|
||||
p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
|
||||
if (p == NULL)
|
||||
goto out;
|
||||
f->gototab = p;
|
||||
|
@ -139,12 +174,13 @@ resize_state(fa *f, int state)
|
|||
f->posns = p3;
|
||||
|
||||
for (i = f->state_count; i < new_count; ++i) {
|
||||
f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
|
||||
f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
|
||||
if (f->gototab[i] == NULL)
|
||||
goto out;
|
||||
f->out[i] = 0;
|
||||
f->posns[i] = NULL;
|
||||
}
|
||||
f->gototab_len = NCHARS; /* should be variable, growable */
|
||||
f->state_count = new_count;
|
||||
return;
|
||||
out:
|
||||
|
@ -239,7 +275,7 @@ int makeinit(fa *f, bool anchor)
|
|||
if ((f->posns[2])[1] == f->accept)
|
||||
f->out[2] = 1;
|
||||
for (i = 0; i < NCHARS; i++)
|
||||
f->gototab[2][i] = 0;
|
||||
set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */
|
||||
f->curstat = cgoto(f, 2, HAT);
|
||||
if (anchor) {
|
||||
*f->posns[2] = k-1; /* leave out position 0 */
|
||||
|
@ -308,13 +344,13 @@ void freetr(Node *p) /* free parse tree */
|
|||
/* in the parsing of regular expressions, metacharacters like . have */
|
||||
/* to be seen literally; \056 is not a metacharacter. */
|
||||
|
||||
int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
|
||||
int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
|
||||
{ /* only pick up one 8-bit byte (2 chars) */
|
||||
const uschar *p;
|
||||
int n = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
|
||||
for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
|
||||
if (isdigit(*p))
|
||||
n = 16 * n + *p - '0';
|
||||
else if (*p >= 'a' && *p <= 'f')
|
||||
|
@ -334,24 +370,28 @@ int quoted(const uschar **pp) /* pick up next thing after a \\ */
|
|||
const uschar *p = *pp;
|
||||
int c;
|
||||
|
||||
if ((c = *p++) == 't')
|
||||
/* BUG: should advance by utf-8 char even if makes no sense */
|
||||
|
||||
if ((c = *p++) == 't') {
|
||||
c = '\t';
|
||||
else if (c == 'n')
|
||||
} else if (c == 'n') {
|
||||
c = '\n';
|
||||
else if (c == 'f')
|
||||
} else if (c == 'f') {
|
||||
c = '\f';
|
||||
else if (c == 'r')
|
||||
} else if (c == 'r') {
|
||||
c = '\r';
|
||||
else if (c == 'b')
|
||||
} else if (c == 'b') {
|
||||
c = '\b';
|
||||
else if (c == 'v')
|
||||
} else if (c == 'v') {
|
||||
c = '\v';
|
||||
else if (c == 'a')
|
||||
} else if (c == 'a') {
|
||||
c = '\a';
|
||||
else if (c == '\\')
|
||||
} else if (c == '\\') {
|
||||
c = '\\';
|
||||
else if (c == 'x') { /* hexadecimal goo follows */
|
||||
c = hexstr(&p); /* this adds a null if number is invalid */
|
||||
} else if (c == 'x') { /* 2 hex digits follow */
|
||||
c = hexstr(&p, 2); /* this adds a null if number is invalid */
|
||||
} else if (c == 'u') { /* unicode char number up to 8 hex digits */
|
||||
c = hexstr(&p, 8);
|
||||
} else if (isoctdigit(c)) { /* \d \dd \ddd */
|
||||
int n = c - '0';
|
||||
if (isoctdigit(*p)) {
|
||||
|
@ -366,50 +406,67 @@ int quoted(const uschar **pp) /* pick up next thing after a \\ */
|
|||
return c;
|
||||
}
|
||||
|
||||
char *cclenter(const char *argp) /* add a character class */
|
||||
int *cclenter(const char *argp) /* add a character class */
|
||||
{
|
||||
int i, c, c2;
|
||||
const uschar *op, *p = (const uschar *) argp;
|
||||
uschar *bp;
|
||||
static uschar *buf = NULL;
|
||||
int n;
|
||||
const uschar *p = (const uschar *) argp;
|
||||
int *bp, *retp;
|
||||
static int *buf = NULL;
|
||||
static int bufsz = 100;
|
||||
|
||||
op = p;
|
||||
if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
|
||||
if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
|
||||
FATAL("out of space for character class [%.10s...] 1", p);
|
||||
bp = buf;
|
||||
for (i = 0; (c = *p++) != 0; ) {
|
||||
for (i = 0; *p != 0; ) {
|
||||
n = u8_rune(&c, p);
|
||||
p += n;
|
||||
if (c == '\\') {
|
||||
c = quoted(&p);
|
||||
} else if (c == '-' && i > 0 && bp[-1] != 0) {
|
||||
if (*p != 0) {
|
||||
c = bp[-1];
|
||||
c2 = *p++;
|
||||
/* c2 = *p++; */
|
||||
n = u8_rune(&c2, p);
|
||||
p += n;
|
||||
if (c2 == '\\')
|
||||
c2 = quoted(&p);
|
||||
c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
|
||||
if (c > c2) { /* empty; ignore */
|
||||
bp--;
|
||||
i--;
|
||||
continue;
|
||||
}
|
||||
while (c < c2) {
|
||||
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
|
||||
if (i >= bufsz) {
|
||||
buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);
|
||||
if (buf == NULL)
|
||||
FATAL("out of space for character class [%.10s...] 2", p);
|
||||
bufsz *= 2;
|
||||
bp = buf + i;
|
||||
}
|
||||
*bp++ = ++c;
|
||||
i++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
|
||||
FATAL("out of space for character class [%.10s...] 3", p);
|
||||
if (i >= bufsz) {
|
||||
buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);
|
||||
if (buf == NULL)
|
||||
FATAL("out of space for character class [%.10s...] 2", p);
|
||||
bufsz *= 2;
|
||||
bp = buf + i;
|
||||
}
|
||||
*bp++ = c;
|
||||
i++;
|
||||
}
|
||||
*bp = 0;
|
||||
DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
|
||||
xfree(op);
|
||||
return (char *) tostring((char *) buf);
|
||||
/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
|
||||
/* xfree(op); BUG: what are we freeing here? */
|
||||
retp = (int *) calloc(bp-buf+1, sizeof(int));
|
||||
for (i = 0; i < bp-buf+1; i++)
|
||||
retp[i] = buf[i];
|
||||
return retp;
|
||||
}
|
||||
|
||||
void overflo(const char *s)
|
||||
|
@ -532,9 +589,9 @@ void follow(Node *v) /* collects leaves that can follow v into setvec */
|
|||
}
|
||||
}
|
||||
|
||||
int member(int c, const char *sarg) /* is c in s? */
|
||||
int member(int c, int *sarg) /* is c in s? */
|
||||
{
|
||||
const uschar *s = (const uschar *) sarg;
|
||||
int *s = (int *) sarg;
|
||||
|
||||
while (*s)
|
||||
if (c == *s++)
|
||||
|
@ -542,11 +599,41 @@ int member(int c, const char *sarg) /* is c in s? */
|
|||
return(0);
|
||||
}
|
||||
|
||||
/*
 * Look up the cached transition out of `state` on character `ch`.
 * Hides the gototab implementation from the matchers.
 * Returns the stored target state, or 0 when no entry for ch is found.
 */
static int get_gototab(fa *f, int state, int ch)
{
	int slot = 0;

	/* scan until the table ends or a slot with ch == 0 is reached */
	while (slot < f->gototab_len && f->gototab[state][slot].ch != 0) {
		if (f->gototab[state][slot].ch == ch)
			return f->gototab[state][slot].state;
		slot++;
	}
	return 0;
}
|
||||
|
||||
/*
 * Record that `state` goes to `val` on character `ch`.
 * Hides the gototab implementation from cgoto().
 * The entry is written to the first slot that either already holds ch
 * or has ch == 0; overflo() is called when no such slot exists.
 * Returns val.
 */
static int set_gototab(fa *f, int state, int ch, int val)
{
	int slot;

	for (slot = 0; slot < f->gototab_len; slot++) {
		/* skip slots occupied by some other character */
		if (f->gototab[state][slot].ch != 0 && f->gototab[state][slot].ch != ch)
			continue;
		f->gototab[state][slot].ch = ch;
		f->gototab[state][slot].state = val;
		return val;
	}
	overflo(__func__);
	return val; /* not used anywhere at the moment */
}
|
||||
|
||||
int match(fa *f, const char *p0) /* shortest match ? */
|
||||
{
|
||||
int s, ns;
|
||||
int n;
|
||||
int rune;
|
||||
const uschar *p = (const uschar *) p0;
|
||||
|
||||
/* return pmatch(f, p0); does it matter whether longest or shortest? */
|
||||
|
||||
s = f->initstat;
|
||||
assert (s < f->state_count);
|
||||
|
||||
|
@ -554,19 +641,25 @@ int match(fa *f, const char *p0) /* shortest match ? */
|
|||
return(1);
|
||||
do {
|
||||
/* assert(*p < NCHARS); */
|
||||
if ((ns = f->gototab[s][*p]) != 0)
|
||||
n = u8_rune(&rune, p);
|
||||
if ((ns = get_gototab(f, s, rune)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(f, s, *p);
|
||||
s = cgoto(f, s, rune);
|
||||
if (f->out[s])
|
||||
return(1);
|
||||
} while (*p++ != 0);
|
||||
if (*p == 0)
|
||||
break;
|
||||
p += n;
|
||||
} while (1); /* was *p++ != 0 */
|
||||
return(0);
|
||||
}
|
||||
|
||||
int pmatch(fa *f, const char *p0) /* longest match, for sub */
|
||||
{
|
||||
int s, ns;
|
||||
int n;
|
||||
int rune;
|
||||
const uschar *p = (const uschar *) p0;
|
||||
const uschar *q;
|
||||
|
||||
|
@ -581,10 +674,11 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
|
|||
if (f->out[s]) /* final state */
|
||||
patlen = q-p;
|
||||
/* assert(*q < NCHARS); */
|
||||
if ((ns = f->gototab[s][*q]) != 0)
|
||||
n = u8_rune(&rune, q);
|
||||
if ((ns = get_gototab(f, s, rune)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(f, s, *q);
|
||||
s = cgoto(f, s, rune);
|
||||
|
||||
assert(s < f->state_count);
|
||||
|
||||
|
@ -596,7 +690,11 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
|
|||
else
|
||||
goto nextin; /* no match */
|
||||
}
|
||||
} while (*q++ != 0);
|
||||
if (*q == 0)
|
||||
break;
|
||||
q += n;
|
||||
} while (1);
|
||||
q++; /* was *q++ */
|
||||
if (f->out[s])
|
||||
patlen = q-p-1; /* don't count $ */
|
||||
if (patlen >= 0) {
|
||||
|
@ -605,13 +703,19 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
|
|||
}
|
||||
nextin:
|
||||
s = 2;
|
||||
} while (*p++);
|
||||
if (*p == 0)
|
||||
break;
|
||||
n = u8_rune(&rune, p);
|
||||
p += n;
|
||||
} while (1); /* was *p++ */
|
||||
return (0);
|
||||
}
|
||||
|
||||
int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
||||
{
|
||||
int s, ns;
|
||||
int n;
|
||||
int rune;
|
||||
const uschar *p = (const uschar *) p0;
|
||||
const uschar *q;
|
||||
|
||||
|
@ -626,10 +730,11 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
|||
if (f->out[s]) /* final state */
|
||||
patlen = q-p;
|
||||
/* assert(*q < NCHARS); */
|
||||
if ((ns = f->gototab[s][*q]) != 0)
|
||||
n = u8_rune(&rune, q);
|
||||
if ((ns = get_gototab(f, s, rune)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(f, s, *q);
|
||||
s = cgoto(f, s, rune);
|
||||
if (s == 1) { /* no transition */
|
||||
if (patlen > 0) {
|
||||
patbeg = (const char *) p;
|
||||
|
@ -637,7 +742,11 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
|||
} else
|
||||
goto nnextin; /* no nonempty match */
|
||||
}
|
||||
} while (*q++ != 0);
|
||||
if (*q == 0)
|
||||
break;
|
||||
q += n;
|
||||
} while (1);
|
||||
q++;
|
||||
if (f->out[s])
|
||||
patlen = q-p-1; /* don't count $ */
|
||||
if (patlen > 0 ) {
|
||||
|
@ -651,6 +760,35 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
|||
return (0);
|
||||
}
|
||||
|
||||
static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum,
|
||||
int *curpos, int *lastpos)
|
||||
{
|
||||
int c = 0;
|
||||
char *buf = *pbuf;
|
||||
static const int max_bytes = 4; // max multiple bytes in UTF-8 is 4
|
||||
int i, rune;
|
||||
uschar private_buf[max_bytes + 1];
|
||||
|
||||
for (i = 0; i <= max_bytes; i++) {
|
||||
if (++*curpos == *lastpos) {
|
||||
if (*lastpos == *pbufsize)
|
||||
if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune"))
|
||||
FATAL("stream '%.30s...' too long", buf);
|
||||
buf[(*lastpos)++] = (c = getc(fp)) != EOF ? c : 0;
|
||||
private_buf[i] = c;
|
||||
}
|
||||
if (c == 0 || c < 128 || (c >> 6) == 4) { // 10xxxxxx starts a new character
|
||||
ungetc(c, fp);
|
||||
private_buf[i] = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
u8_rune(& rune, private_buf);
|
||||
|
||||
return rune;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* NAME
|
||||
|
@ -672,6 +810,7 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
|
|||
char *buf = *pbuf;
|
||||
int bufsize = *pbufsize;
|
||||
int c, i, j, k, ns, s;
|
||||
int rune;
|
||||
|
||||
s = pfa->initstat;
|
||||
patlen = 0;
|
||||
|
@ -695,12 +834,19 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
|
|||
buf[k++] = (c = getc(f)) != EOF ? c : 0;
|
||||
}
|
||||
c = (uschar)buf[j];
|
||||
/* assert(c < NCHARS); */
|
||||
if (c < 128)
|
||||
rune = c;
|
||||
else {
|
||||
j--;
|
||||
k--;
|
||||
ungetc(c, f);
|
||||
rune = getrune(f, &buf, &bufsize, quantum, &j, &k);
|
||||
}
|
||||
|
||||
if ((ns = pfa->gototab[s][c]) != 0)
|
||||
if ((ns = get_gototab(pfa, s, rune)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(pfa, s, c);
|
||||
s = cgoto(pfa, s, rune);
|
||||
|
||||
if (pfa->out[s]) { /* final state */
|
||||
patlen = j - i + 1;
|
||||
|
@ -1026,6 +1172,8 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
|
||||
|
||||
int relex(void) /* lexical analyzer for reparse */
|
||||
{
|
||||
int c, n;
|
||||
|
@ -1043,6 +1191,12 @@ int relex(void) /* lexical analyzer for reparse */
|
|||
rescan:
|
||||
starttok = prestr;
|
||||
|
||||
if ((n = u8_rune(&rlxval, prestr)) > 1) {
|
||||
prestr += n;
|
||||
starttok = prestr;
|
||||
return CHAR;
|
||||
}
|
||||
|
||||
switch (c = *prestr++) {
|
||||
case '|': return OR;
|
||||
case '*': return STAR;
|
||||
|
@ -1080,10 +1234,15 @@ rescan:
|
|||
}
|
||||
else
|
||||
cflag = 0;
|
||||
n = 2 * strlen((const char *) prestr)+1;
|
||||
n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
|
||||
if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
|
||||
FATAL("out of space for reg expr %.10s...", lastre);
|
||||
for (; ; ) {
|
||||
if ((n = u8_rune(&rlxval, prestr)) > 1) {
|
||||
for (i = 0; i < n; i++)
|
||||
*bp++ = *prestr++;
|
||||
continue;
|
||||
}
|
||||
if ((c = *prestr++) == '\\') {
|
||||
*bp++ = '\\';
|
||||
if ((c = *prestr++) == '\0')
|
||||
|
@ -1250,7 +1409,7 @@ int cgoto(fa *f, int s, int c)
|
|||
int *p, *q;
|
||||
int i, j, k;
|
||||
|
||||
assert(c == HAT || c < NCHARS);
|
||||
/* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
|
||||
while (f->accept >= maxsetvec) { /* guessing here! */
|
||||
resizesetvec(__func__);
|
||||
}
|
||||
|
@ -1266,8 +1425,8 @@ int cgoto(fa *f, int s, int c)
|
|||
|| (k == DOT && c != 0 && c != HAT)
|
||||
|| (k == ALL && c != 0)
|
||||
|| (k == EMPTYRE && c != 0)
|
||||
|| (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
|
||||
|| (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
|
||||
|| (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
|
||||
|| (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
|
||||
q = f->re[p[i]].lfollow;
|
||||
for (j = 1; j <= *q; j++) {
|
||||
if (q[j] >= maxsetvec) {
|
||||
|
@ -1299,7 +1458,7 @@ int cgoto(fa *f, int s, int c)
|
|||
goto different;
|
||||
/* setvec is state i */
|
||||
if (c != HAT)
|
||||
f->gototab[s][c] = i;
|
||||
set_gototab(f, s, c, i);
|
||||
return i;
|
||||
different:;
|
||||
}
|
||||
|
@ -1308,13 +1467,13 @@ int cgoto(fa *f, int s, int c)
|
|||
++(f->curstat);
|
||||
resize_state(f, f->curstat);
|
||||
for (i = 0; i < NCHARS; i++)
|
||||
f->gototab[f->curstat][i] = 0;
|
||||
set_gototab(f, f->curstat, 0, 0);
|
||||
xfree(f->posns[f->curstat]);
|
||||
p = intalloc(setcnt + 1, __func__);
|
||||
|
||||
f->posns[f->curstat] = p;
|
||||
if (c != HAT)
|
||||
f->gototab[s][c] = f->curstat;
|
||||
set_gototab(f, s, c, f->curstat);
|
||||
for (i = 0; i <= setcnt; i++)
|
||||
p[i] = tmpset[i];
|
||||
if (setvec[f->accept])
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: lex.c,v 1.30 2023/09/10 14:59:00 millert Exp $ */
|
||||
/* $OpenBSD: lex.c,v 1.31 2023/09/17 14:49:44 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
@ -379,6 +379,8 @@ int yylex(void)
|
|||
}
|
||||
}
|
||||
|
||||
extern int runetochar(char *str, int c);
|
||||
|
||||
int string(void)
|
||||
{
|
||||
int c, n;
|
||||
|
@ -426,7 +428,7 @@ int string(void)
|
|||
*bp++ = n;
|
||||
break;
|
||||
|
||||
case 'x': /* hex \x0-9a-fA-F + */
|
||||
case 'x': /* hex \x0-9a-fA-F (exactly two) */
|
||||
{
|
||||
int i;
|
||||
|
||||
|
@ -452,6 +454,27 @@ int string(void)
|
|||
break;
|
||||
}
|
||||
|
||||
case 'u': /* utf \u0-9a-fA-F (1..8) */
|
||||
{
|
||||
int i;
|
||||
|
||||
n = 0;
|
||||
for (i = 0; i < 8; i++) {
|
||||
c = input();
|
||||
if (!isxdigit(c) || c == 0)
|
||||
break;
|
||||
c = tolower(c);
|
||||
n *= 16;
|
||||
if (isdigit(c))
|
||||
n += (c - '0');
|
||||
else
|
||||
n += 10 + (c - 'a');
|
||||
}
|
||||
unput(c);
|
||||
bp += runetochar(bp, n);
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
*bp++ = c;
|
||||
break;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: lib.c,v 1.50 2023/09/10 14:59:00 millert Exp $ */
|
||||
/* $OpenBSD: lib.c,v 1.51 2023/09/17 14:49:44 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
@ -34,6 +34,8 @@ THIS SOFTWARE.
|
|||
#include <math.h>
|
||||
#include "awk.h"
|
||||
|
||||
extern int u8_nextlen(const char *s);
|
||||
|
||||
char EMPTY[] = { '\0' };
|
||||
FILE *infile = NULL;
|
||||
bool innew; /* true = infile has not been read by readrec */
|
||||
|
@ -217,14 +219,19 @@ void nextfile(void)
|
|||
argno++;
|
||||
}
|
||||
|
||||
extern int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag);
|
||||
|
||||
int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one record into buf */
|
||||
{
|
||||
int sep, c, isrec;
|
||||
char *rr, *buf = *pbuf;
|
||||
int sep, c, isrec; // POTENTIAL BUG? isrec is a macro in awk.h
|
||||
char *rr = *pbuf, *buf = *pbuf;
|
||||
int bufsize = *pbufsize;
|
||||
char *rs = getsval(rsloc);
|
||||
|
||||
if (*rs && rs[1]) {
|
||||
if (CSV) {
|
||||
c = readcsvrec(pbuf, pbufsize, inf, newflag);
|
||||
isrec = (c == EOF && rr == buf) ? false : true;
|
||||
} else if (*rs && rs[1]) {
|
||||
bool found;
|
||||
|
||||
fa *pfa = makedfa(rs, 1);
|
||||
|
@ -276,6 +283,51 @@ int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one rec
|
|||
return isrec;
|
||||
}
|
||||
|
||||
|
||||
/*******************
|
||||
* loose ends here:
|
||||
* \r\n should become \n
|
||||
* what about bare \r? Excel uses that for embedded newlines
|
||||
* can't have "" in unquoted fields, according to RFC 4180
|
||||
*/
|
||||
|
||||
int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* csv can have \n's */
|
||||
{ /* so read a complete record that might be multiple lines */
|
||||
int sep, c;
|
||||
char *rr = *pbuf, *buf = *pbuf;
|
||||
int bufsize = *pbufsize;
|
||||
bool in_quote = false;
|
||||
|
||||
sep = '\n'; /* the only separator; have to skip over \n embedded in "..." */
|
||||
rr = buf;
|
||||
while ((c = getc(inf)) != EOF) {
|
||||
if (c == sep) {
|
||||
if (! in_quote)
|
||||
break;
|
||||
if (rr > buf && rr[-1] == '\r') // remove \r if was \r\n
|
||||
rr--;
|
||||
}
|
||||
|
||||
if (rr-buf+1 > bufsize)
|
||||
if (!adjbuf(&buf, &bufsize, 1+rr-buf,
|
||||
recsize, &rr, "readcsvrec 1"))
|
||||
FATAL("input record `%.30s...' too long", buf);
|
||||
*rr++ = c;
|
||||
if (c == '"')
|
||||
in_quote = ! in_quote;
|
||||
}
|
||||
if (c == '\n' && rr > buf && rr[-1] == '\r') // remove \r if was \r\n
|
||||
rr--;
|
||||
|
||||
if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readcsvrec 4"))
|
||||
FATAL("input record `%.30s...' too long", buf);
|
||||
*rr = 0;
|
||||
*pbuf = buf;
|
||||
*pbufsize = bufsize;
|
||||
DPRINTF("readcsvrec saw <%s>, returns %d\n", buf, c);
|
||||
return c;
|
||||
}
|
||||
|
||||
char *getargv(int n) /* get ARGV[n] */
|
||||
{
|
||||
Cell *x;
|
||||
|
@ -297,6 +349,9 @@ void setclvar(char *s) /* set var=value from s */
|
|||
Cell *q;
|
||||
double result;
|
||||
|
||||
/* commit f3d9187d4e0f02294fb1b0e31152070506314e67 broke T.argv test */
|
||||
/* I don't understand why it was changed. */
|
||||
|
||||
for (p=s; *p != '='; p++)
|
||||
;
|
||||
e = p;
|
||||
|
@ -341,7 +396,7 @@ void fldbld(void) /* create fields from current record */
|
|||
savefs();
|
||||
if (strlen(inputFS) > 1) { /* it's a regular expression */
|
||||
i = refldbld(r, inputFS);
|
||||
} else if ((sep = *inputFS) == ' ') { /* default whitespace */
|
||||
} else if (!CSV && (sep = *inputFS) == ' ') { /* default whitespace */
|
||||
for (i = 0; ; ) {
|
||||
while (*r == ' ' || *r == '\t' || *r == '\n')
|
||||
r++;
|
||||
|
@ -360,26 +415,58 @@ void fldbld(void) /* create fields from current record */
|
|||
*fr++ = 0;
|
||||
}
|
||||
*fr = 0;
|
||||
} else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */
|
||||
for (i = 0; *r != '\0'; r += n) {
|
||||
char buf[MB_LEN_MAX + 1];
|
||||
|
||||
} else if (CSV) { /* CSV processing. no error handling */
|
||||
if (*r != 0) {
|
||||
for (;;) {
|
||||
i++;
|
||||
if (i > nfields)
|
||||
growfldtab(i);
|
||||
if (freeable(fldtab[i]))
|
||||
xfree(fldtab[i]->sval);
|
||||
n = mblen(r, MB_LEN_MAX);
|
||||
if (n < 0)
|
||||
n = 1;
|
||||
memcpy(buf, r, n);
|
||||
buf[n] = '\0';
|
||||
fldtab[i]->sval = fr;
|
||||
fldtab[i]->tval = FLD | STR | DONTFREE;
|
||||
if (*r == '"' ) { /* start of "..." */
|
||||
for (r++ ; *r != '\0'; ) {
|
||||
if (*r == '"' && r[1] != '\0' && r[1] == '"') {
|
||||
r += 2; /* doubled quote */
|
||||
*fr++ = '"';
|
||||
} else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) {
|
||||
r++; /* skip over closing quote */
|
||||
break;
|
||||
} else {
|
||||
*fr++ = *r++;
|
||||
}
|
||||
}
|
||||
*fr++ = 0;
|
||||
} else { /* unquoted field */
|
||||
while (*r != ',' && *r != '\0')
|
||||
*fr++ = *r++;
|
||||
*fr++ = 0;
|
||||
}
|
||||
if (*r++ == 0)
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
*fr = 0;
|
||||
} else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */
|
||||
for (i = 0; *r != '\0'; ) {
|
||||
char buf[10];
|
||||
i++;
|
||||
if (i > nfields)
|
||||
growfldtab(i);
|
||||
if (freeable(fldtab[i]))
|
||||
xfree(fldtab[i]->sval);
|
||||
n = u8_nextlen(r);
|
||||
for (j = 0; j < n; j++)
|
||||
buf[j] = *r++;
|
||||
buf[j] = '\0';
|
||||
fldtab[i]->sval = tostring(buf);
|
||||
fldtab[i]->tval = FLD | STR;
|
||||
}
|
||||
*fr = 0;
|
||||
} else if (*r != 0) { /* if 0, it's a null field */
|
||||
/* subtlecase : if length(FS) == 1 && length(RS > 0)
|
||||
/* subtle case: if length(FS) == 1 && length(RS > 0)
|
||||
* \n is NOT a field separator (cf awk book 61,84).
|
||||
* this variable is tested in the inner while loop.
|
||||
*/
|
||||
|
@ -797,11 +884,11 @@ bool is_valid_number(const char *s, bool trailing_stuff_ok,
|
|||
while (isspace((uschar)*s))
|
||||
s++;
|
||||
|
||||
// no hex floating point, sorry
|
||||
/* no hex floating point, sorry */
|
||||
if (s[0] == '0' && tolower((uschar)s[1]) == 'x')
|
||||
return false;
|
||||
|
||||
// allow +nan, -nan, +inf, -inf, any other letter, no
|
||||
/* allow +nan, -nan, +inf, -inf, any other letter, no */
|
||||
if (s[0] == '+' || s[0] == '-') {
|
||||
is_nan = (strncasecmp(s+1, "nan", 3) == 0);
|
||||
is_inf = (strncasecmp(s+1, "inf", 3) == 0);
|
||||
|
@ -835,7 +922,7 @@ convert:
|
|||
if (no_trailing != NULL)
|
||||
*no_trailing = (*ep == '\0');
|
||||
|
||||
// return true if found the end, or trailing stuff is allowed
|
||||
/* return true if found the end, or trailing stuff is allowed */
|
||||
retval = *ep == '\0' || trailing_stuff_ok;
|
||||
|
||||
return retval;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: main.c,v 1.58 2023/09/10 14:59:00 millert Exp $ */
|
||||
/* $OpenBSD: main.c,v 1.59 2023/09/17 14:49:44 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
|
|||
THIS SOFTWARE.
|
||||
****************************************************************/
|
||||
|
||||
const char *version = "version 20230909";
|
||||
const char *version = "version 20230913";
|
||||
|
||||
#define DEBUG
|
||||
#include <stdio.h>
|
||||
|
@ -52,6 +52,7 @@ static size_t maxpfile; /* max program filename */
|
|||
static size_t npfile; /* number of filenames */
|
||||
static size_t curpfile; /* current filename */
|
||||
|
||||
bool CSV = false; /* true for csv input */
|
||||
bool safe = false; /* true => "safe" mode */
|
||||
bool do_posix = false; /* true => POSIX mode */
|
||||
|
||||
|
@ -170,6 +171,12 @@ int main(int argc, char *argv[])
|
|||
argv++;
|
||||
break;
|
||||
}
|
||||
if (strcmp(argv[1], "--csv") == 0) { /* turn on csv input processing */
|
||||
CSV = true;
|
||||
argc--;
|
||||
argv++;
|
||||
continue;
|
||||
}
|
||||
switch (argv[1][1]) {
|
||||
case 's':
|
||||
if (strcmp(argv[1], "-safe") == 0)
|
||||
|
@ -179,7 +186,7 @@ int main(int argc, char *argv[])
|
|||
fn = getarg(&argc, &argv, "no program filename");
|
||||
if (npfile >= maxpfile) {
|
||||
maxpfile += 20;
|
||||
pfile = (char **) realloc(pfile, maxpfile * sizeof(*pfile));
|
||||
pfile = (char **) reallocarray(pfile, maxpfile, sizeof(*pfile));
|
||||
if (pfile == NULL)
|
||||
FATAL("error allocating space for -f options");
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: proto.h,v 1.21 2023/09/10 14:59:00 millert Exp $ */
|
||||
/* $OpenBSD: proto.h,v 1.22 2023/09/17 14:49:44 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
@ -44,14 +44,13 @@ extern fa *mkdfa(const char *, bool);
|
|||
extern int makeinit(fa *, bool);
|
||||
extern void penter(Node *);
|
||||
extern void freetr(Node *);
|
||||
extern int hexstr(const uschar **);
|
||||
extern int quoted(const uschar **);
|
||||
extern char *cclenter(const char *);
|
||||
extern int *cclenter(const char *);
|
||||
extern noreturn void overflo(const char *);
|
||||
extern void cfoll(fa *, Node *);
|
||||
extern int first(Node *);
|
||||
extern void follow(Node *);
|
||||
extern int member(int, const char *);
|
||||
extern int member(int, int *);
|
||||
extern int match(fa *, const char *);
|
||||
extern int pmatch(fa *, const char *);
|
||||
extern int nematch(fa *, const char *);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: run.c,v 1.74 2022/09/21 01:42:59 millert Exp $ */
|
||||
/* $OpenBSD: run.c,v 1.75 2023/09/17 14:49:44 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
@ -27,7 +27,6 @@ THIS SOFTWARE.
|
|||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <wchar.h>
|
||||
#include <wctype.h>
|
||||
#include <fcntl.h>
|
||||
#include <setjmp.h>
|
||||
|
@ -41,8 +40,10 @@ THIS SOFTWARE.
|
|||
#include "awk.h"
|
||||
#include "awkgram.tab.h"
|
||||
|
||||
|
||||
static void stdinit(void);
|
||||
static void flush_all(void);
|
||||
static char *wide_char_to_byte_str(int rune, size_t *outlen);
|
||||
|
||||
#if 1
|
||||
#define tempfree(x) do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
|
||||
|
@ -580,11 +581,225 @@ Cell *intest(Node **a, int n) /* a[0] is index (list), a[1] is symtab */
|
|||
}
|
||||
|
||||
|
||||
/* ======== utf-8 code ========== */
|
||||
|
||||
/*
|
||||
* Awk strings can contain ascii, random 8-bit items (eg Latin-1),
|
||||
* or utf-8. u8_isutf tests whether a string starts with a valid
|
||||
* utf-8 sequence, and returns 0 if not (e.g., high bit set).
|
||||
* u8_nextlen returns length of next valid sequence, which is
|
||||
* 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
|
||||
* u8_strlen returns length of string in valid utf-8 sequences
|
||||
* and/or high-bit bytes. Conversion functions go between byte
|
||||
* number and character number.
|
||||
*
|
||||
* In theory, this behaves the same as before for non-utf8 bytes.
|
||||
*
|
||||
* Limited checking! This is a potential security hole.
|
||||
*/
|
||||
|
||||
/*
 * Is s the beginning of a valid utf-8 sequence?
 * Returns the sequence length 1..4 if yes, 0 if no.
 * A byte below 128 (including the terminating NUL) counts as length 1.
 * Only the lead/continuation bit patterns are checked.
 */
int u8_isutf(const char *s)
{
	const unsigned char *u = (const unsigned char *) s;
	unsigned char c = u[0];

	if (c < 128)
		return 1; /* what if it's 0? */

	/*
	 * No strlen() is needed here: a continuation byte (10xxxxxx) is never
	 * NUL, and && evaluates left to right, so u[k] is only read after
	 * u[k-1] was seen to be non-NUL.  This keeps the call independent of
	 * the length of s.
	 */
	if ((c & 0xE0) == 0xC0 && (u[1] & 0xC0) == 0x80)
		return 2; /* 110xxxxx 10xxxxxx */
	if ((c & 0xF0) == 0xE0 && (u[1] & 0xC0) == 0x80
	    && (u[2] & 0xC0) == 0x80)
		return 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
	if ((c & 0xF8) == 0xF0 && (u[1] & 0xC0) == 0x80
	    && (u[2] & 0xC0) == 0x80 && (u[3] & 0xC0) == 0x80)
		return 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	return 0;
}
|
||||
|
||||
/*
 * Convert (prefix of) utf8 string to utf-32 rune.
 * Sets *rune to the value, returns the number of bytes consumed (1..4).
 * If the bytes at s do not look like a utf-8 sequence, *rune is set to
 * the first byte and 1 is returned.
 * No further error checking: watch out.
 */
int u8_rune(int *rune, const char *s)
{
	const unsigned char *u = (const unsigned char *) s;
	unsigned char c = u[0];

	if (c < 128) {
		*rune = c;
		return 1;
	}

	/*
	 * No strlen() is needed: a continuation byte (10xxxxxx) is never NUL
	 * and && evaluates left to right, so no byte past the terminator is
	 * ever read.
	 */
	if ((c & 0xE0) == 0xC0 && (u[1] & 0xC0) == 0x80) {
		/* 110xxxxx 10xxxxxx */
		*rune = ((c & 0x1F) << 6) | (u[1] & 0x3F);
		return 2;
	}
	if ((c & 0xF0) == 0xE0 && (u[1] & 0xC0) == 0x80
	    && (u[2] & 0xC0) == 0x80) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		*rune = ((c & 0xF) << 12) | ((u[1] & 0x3F) << 6) | (u[2] & 0x3F);
		return 3;
	}
	if ((c & 0xF8) == 0xF0 && (u[1] & 0xC0) == 0x80
	    && (u[2] & 0xC0) == 0x80 && (u[3] & 0xC0) == 0x80) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		*rune = ((c & 0x7) << 18) | ((u[1] & 0x3F) << 12)
		    | ((u[2] & 0x3F) << 6) | (u[3] & 0x3F);
		return 4;
	}

	/* returns one byte if sequence doesn't look like utf */
	*rune = c;
	return 1;
}
|
||||
|
||||
/*
 * Length in bytes of the next sequence in s: whatever u8_isutf() reports,
 * except that its "not utf-8" answer (0) is turned into 1 so that callers
 * always advance by at least one byte.
 */
int u8_nextlen(const char *s)
{
	int len = u8_isutf(s);

	return len != 0 ? len : 1;
}
|
||||
|
||||
/*
 * Return the number of characters in s, where a character is whatever
 * u8_nextlen() steps over: one valid utf-8 sequence or one single byte.
 */
int u8_strlen(const char *s)
{
	int i, n, totlen;

	n = strlen(s);
	totlen = 0;
	for (i = 0; i < n; i += u8_nextlen(&s[i]))
		totlen++;

	/*
	 * Sanity check: the walk must land exactly on the terminator.
	 * (Inside the loop i < n always holds, so the test has to be made
	 * here to be able to detect a sequence that ran past the end.)
	 */
	if (i > n)
		FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
	return totlen;
}
|
||||
|
||||
/*
 * Convert utf-8 char number in a string to its byte offset.
 * charnum is 0-origin: the result is the total byte length of the first
 * charnum characters of s.  If s holds fewer than charnum characters the
 * walk stops at the terminating NUL and that offset (the byte length of s)
 * is returned, so the result never points past the end of the string.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}
|
||||
|
||||
/*
 * Convert byte offset bytenum in s to a character number.
 * Counts one character per u8_nextlen() step for every step that starts
 * at an offset <= bytenum, so offset 0 yields 1.  Returns 0 when bytenum
 * is negative (no step is taken) and -1 when bytenum is beyond the
 * terminating NUL.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int pos = 0;		/* byte offset of the next sequence */
	int count = 0;		/* BUG: what origin? */

	if (bytenum > (int) strlen(s))
		return -1;	/* ??? */

	while (pos <= bytenum) {
		pos += u8_nextlen(s + pos);
		count++;
	}
	return count;
}
|
||||
|
||||
/* runetochar() adapted from rune.c in the Plan 9 distribution */
|
||||
|
||||
enum
{
	Runeerror = 128, /* from somewhere else */
	Runemax = 0x10FFFF,

	Bit1 = 7,
	Bitx = 6,
	Bit2 = 5,
	Bit3 = 4,
	Bit4 = 3,
	Bit5 = 2,

	T1 = ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
	Tx = ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
	T2 = ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
	T3 = ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
	T4 = ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
	T5 = ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */

	Rune1 = (1<<(Bit1+0*Bitx))-1,	/* 0000 0000 0000 0000 0111 1111 */
	Rune2 = (1<<(Bit2+1*Bitx))-1,	/* 0000 0000 0000 0111 1111 1111 */
	Rune3 = (1<<(Bit3+2*Bitx))-1,	/* 0000 0000 1111 1111 1111 1111 */
	Rune4 = (1<<(Bit4+3*Bitx))-1,	/* 0001 1111 1111 1111 1111 1111 */

	Maskx = (1<<Bitx)-1,		/* 0011 1111 */
	Testx = Maskx ^ 0xFF,		/* 1100 0000 */

};

/*
 * Encode rune c as utf-8 into str and return the number of bytes written
 * (1..4).  No terminating NUL is written; str must have room for 4 bytes.
 * A rune outside 0..Runemax is replaced by Runeerror before encoding.
 */
int runetochar(char *str, int c)
{
	/*
	 * Range-check first, so that the replacement value goes through the
	 * normal size selection below and is emitted in its shortest form
	 * (checking later would encode Runeerror as an overlong 3-byte
	 * sequence, and would pass a negative value through as a raw byte).
	 */
	if (c < 0 || c > Runemax)
		c = Runeerror;

	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	/* two character sequence 00080-007FF => T2 Tx */
	if (c <= Rune2) {
		str[0] = T2 | (c >> 1*Bitx);
		str[1] = Tx | (c & Maskx);
		return 2;
	}

	/* three character sequence 00800-0FFFF => T3 Tx Tx */
	if (c <= Rune3) {
		str[0] = T3 | (c >> 2*Bitx);
		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
		str[2] = Tx | (c & Maskx);
		return 3;
	}

	/* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */
	str[0] = T4 | (c >> 3*Bitx);
	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
	str[3] = Tx | (c & Maskx);
	return 4;
}
|
||||
|
||||
|
||||
/* ========== end of utf8 code =========== */
|
||||
|
||||
|
||||
|
||||
Cell *matchop(Node **a, int n) /* ~ and match() */
|
||||
{
|
||||
Cell *x, *y;
|
||||
char *s, *t;
|
||||
int i;
|
||||
int cstart, cpatlen, len;
|
||||
fa *pfa;
|
||||
int (*mf)(fa *, const char *) = match, mode = 0;
|
||||
|
||||
|
@ -605,9 +820,21 @@ Cell *matchop(Node **a, int n) /* ~ and match() */
|
|||
}
|
||||
tempfree(x);
|
||||
if (n == MATCHFCN) {
|
||||
int start = patbeg - s + 1;
|
||||
if (patlen < 0)
|
||||
start = 0;
|
||||
int start = patbeg - s + 1; /* origin 1 */
|
||||
if (patlen < 0) {
|
||||
start = 0; /* not found */
|
||||
} else {
|
||||
cstart = u8_byte2char(s, start-1);
|
||||
cpatlen = 0;
|
||||
for (i = 0; i < patlen; i += len) {
|
||||
len = u8_nextlen(patbeg+i);
|
||||
cpatlen++;
|
||||
}
|
||||
|
||||
start = cstart;
|
||||
patlen = cpatlen;
|
||||
}
|
||||
|
||||
setfval(rstartloc, (Awkfloat) start);
|
||||
setfval(rlengthloc, (Awkfloat) patlen);
|
||||
x = gettemp();
|
||||
|
@ -658,10 +885,15 @@ Cell *relop(Node **a, int n) /* a[0 < a[1], etc. */
|
|||
int i;
|
||||
Cell *x, *y;
|
||||
Awkfloat j;
|
||||
bool x_is_nan, y_is_nan;
|
||||
|
||||
x = execute(a[0]);
|
||||
y = execute(a[1]);
|
||||
x_is_nan = isnan(x->fval);
|
||||
y_is_nan = isnan(y->fval);
|
||||
if (x->tval&NUM && y->tval&NUM) {
|
||||
if ((x_is_nan || y_is_nan) && n != NE)
|
||||
return(False);
|
||||
j = x->fval - y->fval;
|
||||
i = j<0? -1: (j>0? 1: 0);
|
||||
} else {
|
||||
|
@ -674,7 +906,8 @@ Cell *relop(Node **a, int n) /* a[0 < a[1], etc. */
|
|||
else return(False);
|
||||
case LE: if (i<=0) return(True);
|
||||
else return(False);
|
||||
case NE: if (i!=0) return(True);
|
||||
case NE: if (x_is_nan && y_is_nan) return(True);
|
||||
else if (i!=0) return(True);
|
||||
else return(False);
|
||||
case EQ: if (i == 0) return(True);
|
||||
else return(False);
|
||||
|
@ -743,6 +976,7 @@ Cell *indirect(Node **a, int n) /* $( a[0] ) */
|
|||
Cell *substr(Node **a, int nnn) /* substr(a[0], a[1], a[2]) */
|
||||
{
|
||||
int k, m, n;
|
||||
int mb, nb;
|
||||
char *s;
|
||||
int temp;
|
||||
Cell *x, *y, *z = NULL;
|
||||
|
@ -778,12 +1012,16 @@ Cell *substr(Node **a, int nnn) /* substr(a[0], a[1], a[2]) */
|
|||
n = 0;
|
||||
else if (n > k - m)
|
||||
n = k - m;
|
||||
/* m is start, n is length from there */
|
||||
DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
|
||||
y = gettemp();
|
||||
temp = s[n+m-1]; /* with thanks to John Linderman */
|
||||
s[n+m-1] = '\0';
|
||||
setsval(y, s + m - 1);
|
||||
s[n+m-1] = temp;
|
||||
mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
|
||||
nb = u8_char2byte(s, m-1+n); /* byte offset of end+1 char in s */
|
||||
|
||||
temp = s[nb]; /* with thanks to John Linderman */
|
||||
s[nb] = '\0';
|
||||
setsval(y, s + mb);
|
||||
s[nb] = temp;
|
||||
tempfree(x);
|
||||
return(y);
|
||||
}
|
||||
|
@ -804,7 +1042,15 @@ Cell *sindex(Node **a, int nnn) /* index(a[0], a[1]) */
|
|||
for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
|
||||
continue;
|
||||
if (*p2 == '\0') {
|
||||
v = (Awkfloat) (p1 - s1 + 1); /* origin 1 */
|
||||
/* v = (Awkfloat) (p1 - s1 + 1); origin 1 */
|
||||
|
||||
/* should be a function: used in match() as well */
|
||||
int i, len;
|
||||
v = 0;
|
||||
for (i = 0; i < p1-s1+1; i += len) {
|
||||
len = u8_nextlen(s1+i);
|
||||
v++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -814,6 +1060,18 @@ Cell *sindex(Node **a, int nnn) /* index(a[0], a[1]) */
|
|||
return(z);
|
||||
}
|
||||
|
||||
/*
 * Return 1 if s contains any utf-8 (2 bytes or more) character, 0 otherwise.
 * The string is walked one u8_nextlen() step at a time.
 */
int has_utf8(char *s)
{
	while (*s != 0) {
		int len = u8_nextlen(s);

		if (len > 1)
			return 1;
		s += len;
	}
	return 0;
}
|
||||
|
||||
#define MAXNUMSIZE 50
|
||||
|
||||
int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like conversions */
|
||||
|
@ -856,7 +1114,6 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like co
|
|||
s += 2;
|
||||
continue;
|
||||
}
|
||||
/* have to be real careful in case this is a huge number, eg, %100000d */
|
||||
fmtwd = atoi(s+1);
|
||||
if (fmtwd < 0)
|
||||
fmtwd = -fmtwd;
|
||||
|
@ -929,7 +1186,8 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like co
|
|||
n = fmtwd;
|
||||
adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
|
||||
switch (flag) {
|
||||
case '?': snprintf(p, BUFSZ(p), "%s", fmt); /* unknown, so dump it too */
|
||||
case '?':
|
||||
snprintf(p, BUFSZ(p), "%s", fmt); /* unknown, so dump it too */
|
||||
t = getsval(x);
|
||||
n = strlen(t);
|
||||
if (fmtwd > n)
|
||||
|
@ -943,29 +1201,176 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like co
|
|||
case 'f': snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
|
||||
case 'd': snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
|
||||
case 'u': snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
|
||||
case 's':
|
||||
|
||||
case 's': {
|
||||
t = getsval(x);
|
||||
n = strlen(t);
|
||||
/* if simple format or no utf-8 in the string, sprintf works */
|
||||
if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
|
||||
if (fmtwd > n)
|
||||
n = fmtwd;
|
||||
if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
|
||||
FATAL("huge string/format (%d chars) in printf %.30s... ran format() out of memory", n, t);
|
||||
FATAL("huge string/format (%d chars) in printf %.30s..." \
|
||||
" ran format() out of memory", n, t);
|
||||
snprintf(p, BUFSZ(p), fmt, t);
|
||||
break;
|
||||
case 'c':
|
||||
}
|
||||
|
||||
/* get here if string has utf-8 chars and fmt is not plain %s */
|
||||
/* "%-w.ps", where -, w and .p are all optional */
|
||||
/* '0' before the w is a flag character */
|
||||
/* fmt points at % */
|
||||
int ljust = 0, wid = 0, prec = n, pad = 0;
|
||||
char *f = fmt+1;
|
||||
if (f[0] == '-') {
|
||||
ljust = 1;
|
||||
f++;
|
||||
}
|
||||
// flags '0' and '+' are recognized but skipped
|
||||
if (f[0] == '0') {
|
||||
f++;
|
||||
if (f[0] == '+')
|
||||
f++;
|
||||
}
|
||||
if (f[0] == '+') {
|
||||
f++;
|
||||
if (f[0] == '0')
|
||||
f++;
|
||||
}
|
||||
if (isdigit((uschar)f[0])) { /* there is a wid */
|
||||
wid = strtol(f, &f, 10);
|
||||
}
|
||||
if (f[0] == '.') { /* there is a .prec */
|
||||
prec = strtol(++f, &f, 10);
|
||||
}
|
||||
if (prec > u8_strlen(t))
|
||||
prec = u8_strlen(t);
|
||||
pad = wid>prec ? wid - prec : 0; // has to be >= 0
|
||||
int i, k, n;
|
||||
|
||||
if (ljust) { // print prec chars from t, then pad blanks
|
||||
n = u8_char2byte(t, prec);
|
||||
for (k = 0; k < n; k++) {
|
||||
//putchar(t[k]);
|
||||
*p++ = t[k];
|
||||
}
|
||||
for (i = 0; i < pad; i++) {
|
||||
//printf(" ");
|
||||
*p++ = ' ';
|
||||
}
|
||||
} else { // print pad blanks, then prec chars from t
|
||||
for (i = 0; i < pad; i++) {
|
||||
//printf(" ");
|
||||
*p++ = ' ';
|
||||
}
|
||||
n = u8_char2byte(t, prec);
|
||||
for (k = 0; k < n; k++) {
|
||||
//putchar(t[k]);
|
||||
*p++ = t[k];
|
||||
}
|
||||
}
|
||||
*p = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
case 'c': {
|
||||
/*
|
||||
* If a numeric value is given, awk should just turn
|
||||
* it into a character and print it:
|
||||
* BEGIN { printf("%c\n", 65) }
|
||||
* prints "A".
|
||||
*
|
||||
* But what if the numeric value is > 128 and
|
||||
* represents a valid Unicode code point?!? We do
|
||||
* our best to convert it back into UTF-8. If we
|
||||
* can't, we output the encoding of the Unicode
|
||||
* "invalid character", 0xFFFD.
|
||||
*/
|
||||
if (isnum(x)) {
|
||||
if ((int)getfval(x))
|
||||
snprintf(p, BUFSZ(p), fmt, (int) getfval(x));
|
||||
int charval = (int) getfval(x);
|
||||
|
||||
if (charval != 0) {
|
||||
if (charval < 128)
|
||||
snprintf(p, BUFSZ(p), fmt, charval);
|
||||
else {
|
||||
// possible unicode character
|
||||
size_t count;
|
||||
char *bs = wide_char_to_byte_str(charval, &count);
|
||||
|
||||
if (bs == NULL) { // invalid character
|
||||
// use unicode invalid character, 0xFFFD
|
||||
bs = "\357\277\275";
|
||||
count = 3;
|
||||
}
|
||||
t = bs;
|
||||
n = count;
|
||||
goto format_percent_c;
|
||||
}
|
||||
} else {
|
||||
*p++ = '\0'; /* explicit null byte */
|
||||
*p = '\0'; /* next output will start here */
|
||||
}
|
||||
} else
|
||||
break;
|
||||
}
|
||||
t = getsval(x);
|
||||
n = u8_nextlen(t);
|
||||
format_percent_c:
|
||||
if (n < 2) { /* not utf8 */
|
||||
snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
|
||||
break;
|
||||
}
|
||||
|
||||
// utf8 character, almost same song and dance as for %s
|
||||
int ljust = 0, wid = 0, prec = n, pad = 0;
|
||||
char *f = fmt+1;
|
||||
if (f[0] == '-') {
|
||||
ljust = 1;
|
||||
f++;
|
||||
}
|
||||
// flags '0' and '+' are recognized but skipped
|
||||
if (f[0] == '0') {
|
||||
f++;
|
||||
if (f[0] == '+')
|
||||
f++;
|
||||
}
|
||||
if (f[0] == '+') {
|
||||
f++;
|
||||
if (f[0] == '0')
|
||||
f++;
|
||||
}
|
||||
if (isdigit((uschar)f[0])) { /* there is a wid */
|
||||
wid = strtol(f, &f, 10);
|
||||
}
|
||||
if (f[0] == '.') { /* there is a .prec */
|
||||
prec = strtol(++f, &f, 10);
|
||||
}
|
||||
if (prec > 1) // %c --> only one character
|
||||
prec = 1;
|
||||
pad = wid>prec ? wid - prec : 0; // has to be >= 0
|
||||
int i;
|
||||
|
||||
if (ljust) { // print one char from t, then pad blanks
|
||||
for (int i = 0; i < n; i++)
|
||||
*p++ = t[i];
|
||||
for (i = 0; i < pad; i++) {
|
||||
//printf(" ");
|
||||
*p++ = ' ';
|
||||
}
|
||||
} else { // print pad blanks, then prec chars from t
|
||||
for (i = 0; i < pad; i++) {
|
||||
//printf(" ");
|
||||
*p++ = ' ';
|
||||
}
|
||||
for (int i = 0; i < n; i++)
|
||||
*p++ = t[i];
|
||||
}
|
||||
*p = 0;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
FATAL("can't happen: bad conversion %c in format()", flag);
|
||||
}
|
||||
|
||||
tempfree(x);
|
||||
p += strlen(p);
|
||||
s++;
|
||||
|
@ -1265,7 +1670,7 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */
|
|||
char *origfs = NULL;
|
||||
int sep;
|
||||
char temp, num[50];
|
||||
int n, tempstat, arg3type;
|
||||
int j, n, tempstat, arg3type;
|
||||
double result;
|
||||
|
||||
y = execute(a[0]); /* source string */
|
||||
|
@ -1274,20 +1679,22 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */
|
|||
FATAL("out of space in split");
|
||||
tempfree(y);
|
||||
arg3type = ptoi(a[3]);
|
||||
if (a[2] == NULL) /* fs string */
|
||||
if (a[2] == NULL) { /* BUG: CSV should override implicit fs but not explicit */
|
||||
fs = getsval(fsloc);
|
||||
else if (arg3type == STRING) { /* split(str,arr,"string") */
|
||||
} else if (arg3type == STRING) { /* split(str,arr,"string") */
|
||||
x = execute(a[2]);
|
||||
fs = origfs = strdup(getsval(x));
|
||||
if (fs == NULL)
|
||||
FATAL("out of space in split");
|
||||
tempfree(x);
|
||||
} else if (arg3type == REGEXPR)
|
||||
} else if (arg3type == REGEXPR) {
|
||||
fs = "(regexpr)"; /* split(str,arr,/regexpr/) */
|
||||
else
|
||||
} else {
|
||||
FATAL("illegal type of split");
|
||||
}
|
||||
sep = *fs;
|
||||
ap = execute(a[1]); /* array name */
|
||||
/* BUG 7/26/22: this appears not to reset array: see C1/asplit */
|
||||
freesymtab(ap);
|
||||
DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
|
||||
ap->tval &= ~STR;
|
||||
|
@ -1341,7 +1748,41 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */
|
|||
setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
|
||||
spdone:
|
||||
pfa = NULL;
|
||||
} else if (sep == ' ') {
|
||||
|
||||
} else if (a[2] == NULL && CSV) { /* CSV only if no explicit separator */
|
||||
char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
|
||||
for (;;) {
|
||||
char *fr = newt;
|
||||
n++;
|
||||
if (*s == '"' ) { /* start of "..." */
|
||||
for (s++ ; *s != '\0'; ) {
|
||||
if (*s == '"' && s[1] != '\0' && s[1] == '"') {
|
||||
s += 2; /* doubled quote */
|
||||
*fr++ = '"';
|
||||
} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
|
||||
s++; /* skip over closing quote */
|
||||
break;
|
||||
} else {
|
||||
*fr++ = *s++;
|
||||
}
|
||||
}
|
||||
*fr++ = 0;
|
||||
} else { /* unquoted field */
|
||||
while (*s != ',' && *s != '\0')
|
||||
*fr++ = *s++;
|
||||
*fr++ = 0;
|
||||
}
|
||||
snprintf(num, sizeof(num), "%d", n);
|
||||
if (is_number(newt, &result))
|
||||
setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
|
||||
else
|
||||
setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
|
||||
if (*s++ == '\0')
|
||||
break;
|
||||
}
|
||||
free(newt);
|
||||
|
||||
} else if (!CSV && sep == ' ') { /* usual case: split on white space */
|
||||
for (n = 0; ; ) {
|
||||
#define ISWS(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
|
||||
while (ISWS(*s))
|
||||
|
@ -1364,19 +1805,25 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */
|
|||
if (*s != '\0')
|
||||
s++;
|
||||
}
|
||||
|
||||
} else if (sep == 0) { /* new: split(s, a, "") => 1 char/elem */
|
||||
for (n = 0; *s != '\0'; s++) {
|
||||
char buf[2];
|
||||
for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
|
||||
char buf[10];
|
||||
n++;
|
||||
snprintf(num, sizeof(num), "%d", n);
|
||||
buf[0] = *s;
|
||||
buf[1] = '\0';
|
||||
|
||||
for (j = 0; j < u8_nextlen(s); j++) {
|
||||
buf[j] = s[j];
|
||||
}
|
||||
buf[j] = '\0';
|
||||
|
||||
if (isdigit((uschar)buf[0]))
|
||||
setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
|
||||
else
|
||||
setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
|
||||
}
|
||||
} else if (*s != '\0') {
|
||||
|
||||
} else if (*s != '\0') { /* some random single character */
|
||||
for (;;) {
|
||||
n++;
|
||||
t = s;
|
||||
|
@ -1535,6 +1982,7 @@ static char *nawk_convert(const char *s, int (*fun_c)(int),
|
|||
size_t n = 0;
|
||||
wchar_t wc;
|
||||
size_t sz = MB_CUR_MAX;
|
||||
int unused;
|
||||
|
||||
if (sz == 1) {
|
||||
buf = tostring(s);
|
||||
|
@ -1554,7 +2002,7 @@ static char *nawk_convert(const char *s, int (*fun_c)(int),
|
|||
* doesn't work.)
|
||||
* Increment said variable to avoid a different warning.
|
||||
*/
|
||||
int unused = wctomb(NULL, L'\0');
|
||||
unused = wctomb(NULL, L'\0');
|
||||
unused++;
|
||||
|
||||
ps = s;
|
||||
|
@ -1629,7 +2077,7 @@ Cell *bltin(Node **a, int n) /* builtin functions. a[0] is type, a[1] is arg lis
|
|||
if (isarr(x))
|
||||
u = ((Array *) x->sval)->nelem; /* GROT. should be function*/
|
||||
else
|
||||
u = strlen(getsval(x));
|
||||
u = u8_strlen(getsval(x));
|
||||
break;
|
||||
case FLOG:
|
||||
errno = 0;
|
||||
|
@ -2402,3 +2850,41 @@ void backsub(char **pb_ptr, const char **sptr_ptr) /* handle \\& variations */
|
|||
*pb_ptr = pb;
|
||||
*sptr_ptr = sptr;
|
||||
}
|
||||
|
||||
/*
 * wide_char_to_byte_str: encode the Unicode code point `rune` as UTF-8.
 *
 * On success, returns a pointer to a NUL-terminated static buffer holding
 * the 1-4 byte encoding and stores the encoded length (not counting the
 * terminator) in *outlen.  The buffer is overwritten by the next call.
 *
 * Returns NULL, leaving *outlen untouched, when `rune` cannot be encoded:
 * negative values, values above 0x10FFFF, and the UTF-16 surrogate range
 * 0xD800-0xDFFF, which has no well-formed UTF-8 representation.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char buf[5];
	int len;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;
	/* surrogates are not Unicode scalar values; encoding them is ill-formed */
	if (rune >= 0xD800 && rune <= 0xDFFF)
		return NULL;

	memset(buf, 0, sizeof(buf));

	len = 0;
	if (rune <= 0x0000007F) {
		// 0xxxxxxx
		buf[len++] = (char) rune;
	} else if (rune <= 0x000007FF) {
		// 110xxxxx 10xxxxxx
		buf[len++] = (char) (0xC0 | (rune >> 6));
		buf[len++] = (char) (0x80 | (rune & 0x3F));
	} else if (rune <= 0x0000FFFF) {
		// 1110xxxx 10xxxxxx 10xxxxxx
		buf[len++] = (char) (0xE0 | (rune >> 12));
		buf[len++] = (char) (0x80 | ((rune >> 6) & 0x3F));
		buf[len++] = (char) (0x80 | (rune & 0x3F));
	} else {
		// 0x00010000 - 0x10FFFF
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		buf[len++] = (char) (0xF0 | (rune >> 18));
		buf[len++] = (char) (0x80 | ((rune >> 12) & 0x3F));
		buf[len++] = (char) (0x80 | ((rune >> 6) & 0x3F));
		buf[len++] = (char) (0x80 | (rune & 0x3F));
	}

	*outlen = len;
	buf[len] = '\0';

	return buf;
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $OpenBSD: tran.c,v 1.36 2022/09/21 01:42:59 millert Exp $ */
|
||||
/* $OpenBSD: tran.c,v 1.37 2023/09/17 14:49:44 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
|
@ -309,7 +309,7 @@ Awkfloat setfval(Cell *vp, Awkfloat f) /* set float val of a Cell */
|
|||
} else if (&vp->fval == NF) {
|
||||
donerec = false; /* mark $0 invalid */
|
||||
setlastfld(f);
|
||||
DPRINTF("setting NF to %g\n", f);
|
||||
DPRINTF("setfval: setting NF to %g\n", f);
|
||||
} else if (isrec(vp)) {
|
||||
donefld = false; /* mark $1... invalid */
|
||||
donerec = true;
|
||||
|
@ -349,6 +349,10 @@ char *setsval(Cell *vp, const char *s) /* set string val of a Cell */
|
|||
(void*)vp, NN(vp->nval), s, vp->tval, donerec, donefld);
|
||||
if ((vp->tval & (NUM | STR)) == 0)
|
||||
funnyvar(vp, "assign to");
|
||||
if (CSV && (vp == rsloc))
|
||||
WARNING("danger: don't set RS when --csv is in effect");
|
||||
if (CSV && (vp == fsloc))
|
||||
WARNING("danger: don't set FS when --csv is in effect");
|
||||
if (isfld(vp)) {
|
||||
donerec = false; /* mark $0 invalid */
|
||||
fldno = atoi(vp->nval);
|
||||
|
@ -376,7 +380,7 @@ char *setsval(Cell *vp, const char *s) /* set string val of a Cell */
|
|||
donerec = false; /* mark $0 invalid */
|
||||
f = getfval(vp);
|
||||
setlastfld(f);
|
||||
DPRINTF("setting NF to %g\n", f);
|
||||
DPRINTF("setsval: setting NF to %g\n", f);
|
||||
}
|
||||
|
||||
return(vp->sval);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.\" $OpenBSD: tmux.1,v 1.932 2023/09/15 06:31:49 nicm Exp $
|
||||
.\" $OpenBSD: tmux.1,v 1.933 2023/09/16 16:18:29 nicm Exp $
|
||||
.\"
|
||||
.\" Copyright (c) 2007 Nicholas Marriott <nicholas.marriott@gmail.com>
|
||||
.\"
|
||||
|
@ -14,7 +14,7 @@
|
|||
.\" IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
|
||||
.\" OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
.\"
|
||||
.Dd $Mdocdate: September 15 2023 $
|
||||
.Dd $Mdocdate: September 16 2023 $
|
||||
.Dt TMUX 1
|
||||
.Os
|
||||
.Sh NAME
|
||||
|
@ -2004,18 +2004,6 @@ Move the cursor to the start of the line.
|
|||
(emacs: M-R)
|
||||
.Xc
|
||||
Move to the top line.
|
||||
.It Xo
|
||||
.Ic next-prompt
|
||||
(vi: C-n)
|
||||
(emacs: C-n)
|
||||
.Xc
|
||||
Move to the next prompt.
|
||||
.It Xo
|
||||
.Ic previous-prompt
|
||||
(vi: C-p)
|
||||
(emacs: C-p)
|
||||
.Xc
|
||||
Move to the previous prompt.
|
||||
.El
|
||||
.Pp
|
||||
The search commands come in several varieties:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue