From 5fd83771641d15c418f747bd343ba6738d3875f7 Mon Sep 17 00:00:00 2001 From: Cameron Katri Date: Sun, 9 May 2021 14:20:58 -0400 Subject: Import macOS userland adv_cmds-176 basic_cmds-55 bootstrap_cmds-116.100.1 developer_cmds-66 diskdev_cmds-667.40.1 doc_cmds-53.60.1 file_cmds-321.40.3 mail_cmds-35 misc_cmds-34 network_cmds-606.40.1 patch_cmds-17 remote_cmds-63 shell_cmds-216.60.1 system_cmds-880.60.2 text_cmds-106 --- text_cmds/tr/cmap.c | 212 ++++++++++++++++++++++++ text_cmds/tr/cmap.h | 83 ++++++++++ text_cmds/tr/cset.c | 290 ++++++++++++++++++++++++++++++++ text_cmds/tr/cset.h | 74 +++++++++ text_cmds/tr/extern.h | 55 ++++++ text_cmds/tr/str.c | 450 ++++++++++++++++++++++++++++++++++++++++++++++++++ text_cmds/tr/tr.1 | 420 ++++++++++++++++++++++++++++++++++++++++++++++ text_cmds/tr/tr.c | 378 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 1962 insertions(+) create mode 100644 text_cmds/tr/cmap.c create mode 100644 text_cmds/tr/cmap.h create mode 100644 text_cmds/tr/cset.c create mode 100644 text_cmds/tr/cset.h create mode 100644 text_cmds/tr/extern.h create mode 100644 text_cmds/tr/str.c create mode 100644 text_cmds/tr/tr.1 create mode 100644 text_cmds/tr/tr.c (limited to 'text_cmds/tr') diff --git a/text_cmds/tr/cmap.c b/text_cmds/tr/cmap.c new file mode 100644 index 0000000..1edd9d2 --- /dev/null +++ b/text_cmds/tr/cmap.c @@ -0,0 +1,212 @@ +/*- + * Copyright (c) 2004 Tim J. Robbins. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * "Character map" ADT. Stores mappings between pairs of characters in a + * splay tree, with a lookup table cache to simplify looking up the first + * bunch of characters (which are presumably more common than others). + */ + +#include +__FBSDID("$FreeBSD: src/usr.bin/tr/cmap.c,v 1.2 2004/07/14 08:36:09 tjr Exp $"); + +#include +#include +#include +#include +#include +#include "cmap.h" + +static struct cmapnode *cmap_splay(struct cmapnode *, wint_t); + +/* + * cmap_alloc -- + * Allocate a character map. + */ +struct cmap * +cmap_alloc(void) +{ + struct cmap *cm; + + cm = malloc(sizeof(*cm)); + if (cm == NULL) + return (NULL); + cm->cm_root = NULL; + cm->cm_def = CM_DEF_SELF; + cm->cm_havecache = false; + cm->cm_min = cm->cm_max = 0; + return (cm); +} + +/* + * cmap_add -- + * Add a mapping from "from" to "to" to the map. + */ +bool +cmap_add(struct cmap *cm, wint_t from, wint_t to) +{ + struct cmapnode *cmn, *ncmn; + + cm->cm_havecache = false; + + if (cm->cm_root == NULL) { + cmn = malloc(sizeof(*cmn)); + if (cmn == NULL) + return (false); + cmn->cmn_from = from; + cmn->cmn_to = to; + cmn->cmn_left = cmn->cmn_right = NULL; + cm->cm_root = cmn; + cm->cm_min = cm->cm_max = from; + return (true); + } + + cmn = cm->cm_root = cmap_splay(cm->cm_root, from); + + if (cmn->cmn_from == from) { + cmn->cmn_to = to; + return (true); + } + + ncmn = malloc(sizeof(*ncmn)); + if (ncmn == NULL) + return (false); + ncmn->cmn_from = from; + ncmn->cmn_to = to; + if (from < cmn->cmn_from) { + ncmn->cmn_left = cmn->cmn_left; + ncmn->cmn_right = cmn; + cmn->cmn_left = NULL; + } else { + ncmn->cmn_right = cmn->cmn_right; + ncmn->cmn_left = cmn; + cmn->cmn_right = NULL; + } + if (from < cm->cm_min) + cm->cm_min = from; + if (from > cm->cm_max) + cm->cm_max = from; + cm->cm_root = ncmn; + + return (true); +} + +/* + * cmap_lookup_hard -- + * Look up the mapping for a character without using the cache. + */ +wint_t +cmap_lookup_hard(struct cmap *cm, wint_t ch) +{ + + if (cm->cm_root != NULL) { + cm->cm_root = cmap_splay(cm->cm_root, ch); + if (cm->cm_root->cmn_from == ch) + return (cm->cm_root->cmn_to); + } + return (cm->cm_def == CM_DEF_SELF ? ch : cm->cm_def); +} + +/* + * cmap_cache -- + * Update the cache. + */ +void +cmap_cache(struct cmap *cm) +{ + wint_t ch; + + for (ch = 0; ch < CM_CACHE_SIZE; ch++) + cm->cm_cache[ch] = cmap_lookup_hard(cm, ch); + + cm->cm_havecache = true; +} + +/* + * cmap_default -- + * Change the value that characters without mappings map to, and + * return the old value. The special character value CM_MAP_SELF + * means characters map to themselves. + */ +wint_t +cmap_default(struct cmap *cm, wint_t def) +{ + wint_t old; + + old = cm->cm_def; + cm->cm_def = def; + cm->cm_havecache = false; + return (old); +} + +static struct cmapnode * +cmap_splay(struct cmapnode *t, wint_t ch) +{ + struct cmapnode N, *l, *r, *y; + + /* + * Based on public domain code from Sleator. + */ + + assert(t != NULL); + + N.cmn_left = N.cmn_right = NULL; + l = r = &N; + for (;;) { + if (ch < t->cmn_from) { + if (t->cmn_left != NULL && + ch < t->cmn_left->cmn_from) { + y = t->cmn_left; + t->cmn_left = y->cmn_right; + y->cmn_right = t; + t = y; + } + if (t->cmn_left == NULL) + break; + r->cmn_left = t; + r = t; + t = t->cmn_left; + } else if (ch > t->cmn_from) { + if (t->cmn_right != NULL && + ch > t->cmn_right->cmn_from) { + y = t->cmn_right; + t->cmn_right = y->cmn_left; + y->cmn_left = t; + t = y; + } + if (t->cmn_right == NULL) + break; + l->cmn_right = t; + l = t; + t = t->cmn_right; + } else + break; + } + l->cmn_right = t->cmn_left; + r->cmn_left = t->cmn_right; + t->cmn_left = N.cmn_right; + t->cmn_right = N.cmn_left; + return (t); +} diff --git a/text_cmds/tr/cmap.h b/text_cmds/tr/cmap.h new file mode 100644 index 0000000..38e3d0e --- /dev/null +++ b/text_cmds/tr/cmap.h @@ -0,0 +1,83 @@ +/*- + * Copyright (c) 2004 Tim J. Robbins. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.bin/tr/cmap.h,v 1.1 2004/07/09 02:08:07 tjr Exp $ + */ + +#ifndef CMAP_H +#define CMAP_H + +#include +#include +#include + +struct cmapnode { + wint_t cmn_from; + wint_t cmn_to; + struct cmapnode *cmn_left; + struct cmapnode *cmn_right; +}; + +struct cmap { +#define CM_CACHE_SIZE 128 + wint_t cm_cache[CM_CACHE_SIZE]; + bool cm_havecache; + struct cmapnode *cm_root; +#define CM_DEF_SELF -2 + wint_t cm_def; + wint_t cm_min; + wint_t cm_max; +}; + +struct cmap * cmap_alloc(void); +bool cmap_add(struct cmap *, wint_t, wint_t); +wint_t cmap_lookup_hard(struct cmap *, wint_t); +void cmap_cache(struct cmap *); +wint_t cmap_default(struct cmap *, wint_t); + +static __inline wint_t +cmap_lookup(struct cmap *cm, wint_t from) +{ + + if (from < CM_CACHE_SIZE && cm->cm_havecache) + return (cm->cm_cache[from]); + return (cmap_lookup_hard(cm, from)); +} + +static __inline wint_t +cmap_min(struct cmap *cm) +{ + + return (cm->cm_min); +} + +static __inline wint_t +cmap_max(struct cmap *cm) +{ + + return (cm->cm_max); +} + +#endif diff --git a/text_cmds/tr/cset.c b/text_cmds/tr/cset.c new file mode 100644 index 0000000..6e7c217 --- /dev/null +++ b/text_cmds/tr/cset.c @@ -0,0 +1,290 @@ +/*- + * Copyright (c) 2004 Tim J. Robbins. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * "Set of characters" ADT implemented as a splay tree of extents, with + * a lookup table cache to simplify looking up the first bunch of + * characters (which are presumably more common than others). + */ + +#include +__FBSDID("$FreeBSD: src/usr.bin/tr/cset.c,v 1.3 2004/07/14 08:33:14 tjr Exp $"); + +#include +#include +#include +#include +#include +#include "cset.h" + +static struct csnode * cset_delete(struct csnode *, wchar_t); +static __inline int cset_rangecmp(struct csnode *, wchar_t); +static struct csnode * cset_splay(struct csnode *, wchar_t); + +/* + * cset_alloc -- + * Allocate a set of characters. + */ +struct cset * +cset_alloc(void) +{ + struct cset *cs; + + if ((cs = malloc(sizeof(*cs))) == NULL) + return (NULL); + cs->cs_root = NULL; + cs->cs_classes = NULL; + cs->cs_havecache = false; + cs->cs_invert = false; + return (cs); +} + +/* + * cset_add -- + * Add a character to the set. + */ +bool +cset_add(struct cset *cs, wchar_t ch) +{ + struct csnode *csn, *ncsn; + wchar_t oval; + + cs->cs_havecache = false; + + /* + * Inserting into empty tree; new item becomes the root. + */ + if (cs->cs_root == NULL) { + csn = malloc(sizeof(*cs->cs_root)); + if (csn == NULL) + return (false); + csn->csn_left = csn->csn_right = NULL; + csn->csn_min = csn->csn_max = ch; + cs->cs_root = csn; + return (true); + } + + /* + * Splay to check whether the item already exists, and otherwise, + * where we should put it. + */ + csn = cs->cs_root = cset_splay(cs->cs_root, ch); + + /* + * Avoid adding duplicate nodes. + */ + if (cset_rangecmp(csn, ch) == 0) + return (true); + + /* + * Allocate a new node and make it the new root. + */ + ncsn = malloc(sizeof(*ncsn)); + if (ncsn == NULL) + return (false); + ncsn->csn_min = ncsn->csn_max = ch; + if (cset_rangecmp(csn, ch) < 0) { + ncsn->csn_left = csn->csn_left; + ncsn->csn_right = csn; + csn->csn_left = NULL; + } else { + ncsn->csn_right = csn->csn_right; + ncsn->csn_left = csn; + csn->csn_right = NULL; + } + cs->cs_root = ncsn; + + /* + * Coalesce with left and right neighbours if possible. + */ + if (ncsn->csn_left != NULL) { + ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1); + if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) { + oval = ncsn->csn_left->csn_min; + ncsn->csn_left = cset_delete(ncsn->csn_left, + ncsn->csn_left->csn_min); + ncsn->csn_min = oval; + } + } + if (ncsn->csn_right != NULL) { + ncsn->csn_right = cset_splay(ncsn->csn_right, + ncsn->csn_max + 1); + if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) { + oval = ncsn->csn_right->csn_max; + ncsn->csn_right = cset_delete(ncsn->csn_right, + ncsn->csn_right->csn_min); + ncsn->csn_max = oval; + } + } + + return (true); +} + +/* + * cset_in_hard -- + * Determine whether a character is in the set without using + * the cache. + */ +bool +cset_in_hard(struct cset *cs, wchar_t ch) +{ + struct csclass *csc; + + for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next) + if (csc->csc_invert ^ iswctype(ch, csc->csc_type) != 0) + return (cs->cs_invert ^ true); + if (cs->cs_root != NULL) { + cs->cs_root = cset_splay(cs->cs_root, ch); + return (cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch) == 0); + } + return (cs->cs_invert ^ false); +} + +/* + * cset_cache -- + * Update the cache. + */ +void +cset_cache(struct cset *cs) +{ + wchar_t i; + + for (i = 0; i < CS_CACHE_SIZE; i++) + cs->cs_cache[i] = cset_in_hard(cs, i); + + cs->cs_havecache = true; +} + +/* + * cset_invert -- + * Invert the character set. + */ +void +cset_invert(struct cset *cs) +{ + + cs->cs_invert ^= true; + cs->cs_havecache = false; +} + +/* + * cset_addclass -- + * Add a wctype()-style character class to the set, optionally + * inverting it. + */ +bool +cset_addclass(struct cset *cs, wctype_t type, bool invert) +{ + struct csclass *csc; + + csc = malloc(sizeof(*csc)); + if (csc == NULL) + return (false); + csc->csc_type = type; + csc->csc_invert = invert; + csc->csc_next = cs->cs_classes; + cs->cs_classes = csc; + cs->cs_havecache = false; + return (true); +} + +static __inline int +cset_rangecmp(struct csnode *t, wchar_t ch) +{ + + if (ch < t->csn_min) + return (-1); + if (ch > t->csn_max) + return (1); + return (0); +} + +static struct csnode * +cset_splay(struct csnode *t, wchar_t ch) +{ + struct csnode N, *l, *r, *y; + + /* + * Based on public domain code from Sleator. + */ + + assert(t != NULL); + + N.csn_left = N.csn_right = NULL; + l = r = &N; + for (;;) { + if (cset_rangecmp(t, ch) < 0) { + if (t->csn_left != NULL && + cset_rangecmp(t->csn_left, ch) < 0) { + y = t->csn_left; + t->csn_left = y->csn_right; + y->csn_right = t; + t = y; + } + if (t->csn_left == NULL) + break; + r->csn_left = t; + r = t; + t = t->csn_left; + } else if (cset_rangecmp(t, ch) > 0) { + if (t->csn_right != NULL && + cset_rangecmp(t->csn_right, ch) > 0) { + y = t->csn_right; + t->csn_right = y->csn_left; + y->csn_left = t; + t = y; + } + if (t->csn_right == NULL) + break; + l->csn_right = t; + l = t; + t = t->csn_right; + } else + break; + } + l->csn_right = t->csn_left; + r->csn_left = t->csn_right; + t->csn_left = N.csn_right; + t->csn_right = N.csn_left; + return (t); +} + +static struct csnode * +cset_delete(struct csnode *t, wchar_t ch) +{ + struct csnode *x; + + assert(t != NULL); + t = cset_splay(t, ch); + assert(cset_rangecmp(t, ch) == 0); + if (t->csn_left == NULL) + x = t->csn_right; + else { + x = cset_splay(t->csn_left, ch); + x->csn_right = t->csn_right; + } + free(t); + return x; +} diff --git a/text_cmds/tr/cset.h b/text_cmds/tr/cset.h new file mode 100644 index 0000000..ae62d1d --- /dev/null +++ b/text_cmds/tr/cset.h @@ -0,0 +1,74 @@ +/*- + * Copyright (c) 2004 Tim J. Robbins. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.bin/tr/cset.h,v 1.2 2004/07/14 08:35:11 tjr Exp $ + */ + +#ifndef CSET_H +#define CSET_H + +#include +#include +#include + +struct csnode { + wchar_t csn_min; + wchar_t csn_max; + struct csnode *csn_left; + struct csnode *csn_right; +}; + +struct csclass { + wctype_t csc_type; + bool csc_invert; + struct csclass *csc_next; +}; + +struct cset { +#define CS_CACHE_SIZE 256 + bool cs_cache[CS_CACHE_SIZE]; + bool cs_havecache; + struct csclass *cs_classes; + struct csnode *cs_root; + bool cs_invert; +}; + +bool cset_addclass(struct cset *, wctype_t, bool); +struct cset * cset_alloc(void); +bool cset_add(struct cset *, wchar_t); +void cset_invert(struct cset *); +bool cset_in_hard(struct cset *, wchar_t); +void cset_cache(struct cset *); + +static __inline bool +cset_in(struct cset *cs, wchar_t ch) +{ + + if (ch < CS_CACHE_SIZE && cs->cs_havecache) + return (cs->cs_cache[ch]); + return (cset_in_hard(cs, ch)); +} + +#endif /* CSET_H */ diff --git a/text_cmds/tr/extern.h b/text_cmds/tr/extern.h new file mode 100644 index 0000000..adc9ec3 --- /dev/null +++ b/text_cmds/tr/extern.h @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)extern.h 8.1 (Berkeley) 6/6/93 + * $FreeBSD: src/usr.bin/tr/extern.h,v 1.9 2004/07/09 02:08:07 tjr Exp $ + */ + +#include + +#define NCHARS_SB (UCHAR_MAX + 1) /* Number of single-byte characters. */ +#define OOBCH -1 /* Out of band character value. */ + +typedef struct { + enum { STRING1, STRING2 } which; + enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, + CCLASS, CCLASS_UPPER, CCLASS_LOWER, SET } state; + int cnt; /* character count */ + wint_t lastch; /* last character */ + wctype_t cclass; /* character class from wctype() */ + wint_t equiv[NCHARS_SB]; /* equivalence set */ + wint_t *set; /* set of characters */ + char *str; /* user's string */ +} STR; + +wint_t next(STR *); +int charcoll(const void *, const void *); diff --git a/text_cmds/tr/str.c b/text_cmds/tr/str.c new file mode 100644 index 0000000..e7e6757 --- /dev/null +++ b/text_cmds/tr/str.c @@ -0,0 +1,450 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +__FBSDID("$FreeBSD: src/usr.bin/tr/str.c,v 1.24 2004/11/14 05:15:25 jkh Exp $"); + +#ifndef lint +static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "extern.h" + +static int backslash(STR *, int *); +static int bracket(STR *); +static void genclass(STR *); +static void genequiv(STR *); +static int genrange(STR *, int); +static void genseq(STR *); + +/* + * Using libc internal function __collate_lookup_l for character + * equivalence + */ +void __collate_lookup_l(const __darwin_wchar_t *, int *, int *, int *, +locale_t); +/* + * Cache for primary collation weight of each single byte character + * used in static void genequiv(s) + */ +int collation_weight_cache[NCHARS_SB]; +int is_weight_cached = 0; + +wint_t +next(s) + STR *s; +{ + int is_octal; + wint_t ch; + wchar_t wch; + size_t clen; + + switch (s->state) { + case EOS: + return (0); + case INFINITE: +#ifdef __APPLE__ + switch (ch = (u_char)*s->str) { + case '\0': + /* + * force at least one postive return so setup() will + * process lastch of a sequence like [a*]; but change + * state so it won't get stuck in a while(next(s)) loop + */ + s->state = NORMAL; + } +#endif /* __APPLE__ */ + return (1); + case NORMAL: + switch (*s->str) { + case '\0': + s->state = EOS; + return (0); + case '\\': + s->lastch = backslash(s, &is_octal); + break; + case '[': + if (bracket(s)) + return (next(s)); + /* FALLTHROUGH */ + default: + clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2 || + clen == 0) + errc(1, EILSEQ, NULL); + is_octal = 0; + s->lastch = wch; + s->str += clen; + break; + } + + /* We can start a range at any time. */ + if (s->str[0] == '-' && genrange(s, is_octal)) + return (next(s)); + return (1); + case RANGE: + if (s->cnt-- == 0) { + s->state = NORMAL; + return (next(s)); + } + ++s->lastch; + return (1); + case SEQUENCE: + if (s->cnt-- == 0) { + s->state = NORMAL; + return (next(s)); + } + return (1); + case CCLASS: + case CCLASS_UPPER: + case CCLASS_LOWER: + s->cnt++; + ch = nextwctype(s->lastch, s->cclass); + if (ch == -1) { + s->state = NORMAL; + return (next(s)); + } + s->lastch = ch; + return (1); + case SET: + if ((ch = s->set[s->cnt++]) == OOBCH) { + s->state = NORMAL; + return (next(s)); + } + s->lastch = ch; + return (1); + default: + return (0); + } + /* NOTREACHED */ +} + +static int +bracket(s) + STR *s; +{ + char *p; + + switch (s->str[1]) { + case ':': /* "[:class:]" */ + if ((p = strchr(s->str + 2, ']')) == NULL) + return (0); + if (*(p - 1) != ':' || p - s->str < 4) + goto repeat; + *(p - 1) = '\0'; + s->str += 2; + genclass(s); + s->str = p + 1; + return (1); + case '=': /* "[=equiv=]" */ + if ((p = strchr(s->str + 2, ']')) == NULL) + return (0); + if (*(p - 1) != '=' || p - s->str < 4) + goto repeat; + s->str += 2; + genequiv(s); + return (1); + default: /* "[\###*n]" or "[#*n]" */ + repeat: + if ((p = strpbrk(s->str + 2, "*]")) == NULL) + return (0); + if (p[0] != '*' || index(p, ']') == NULL) + return (0); + s->str += 1; + genseq(s); + return (1); + } + /* NOTREACHED */ +} + +static void +genclass(s) + STR *s; +{ + + if ((s->cclass = wctype(s->str)) == 0) + errx(1, "unknown class %s", s->str); + s->cnt = 0; + s->lastch = -1; /* incremented before check in next() */ + if (strcmp(s->str, "upper") == 0) + s->state = CCLASS_UPPER; + else if (strcmp(s->str, "lower") == 0) + s->state = CCLASS_LOWER; + else + s->state = CCLASS; +} + +static void +genequiv(s) + STR *s; +{ + int i, p; + size_t clen; + wchar_t wc; + + if (*s->str == '\\') { + s->equiv[0] = backslash(s, NULL); + if (*s->str != '=') + errx(1, "misplaced equivalence equals sign"); + s->str += 2; + } else { + clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0) + errc(1, EILSEQ, NULL); + s->equiv[0] = wc; + if (s->str[clen] != '=') + errx(1, "misplaced equivalence equals sign"); + s->str += clen + 2; + } + + /* + * Partially supporting multi-byte locales; only finds equivalent + * characters within the first NCHARS_SB entries of the + * collation table + */ + int tprim, tsec; + int len; + __collate_lookup_l(s->equiv, &len, &tprim, &tsec, LC_GLOBAL_LOCALE); + + if (tprim != -1) { + for (p = 1, i = 1; i < NCHARS_SB; i++) { + int cprim; + if (is_weight_cached) { + /* + * retrieve primary weight from cache + */ + cprim = collation_weight_cache[i]; + } else { + /* + * perform lookup of primary weight and fill cache + */ + int csec; + __collate_lookup_l((__darwin_wchar_t *)&i, &len, &cprim, &csec, LC_GLOBAL_LOCALE); + collation_weight_cache[i] = cprim; + } + + /* + * If a character does not exist in the collation + * table, just skip it + */ + if (cprim == -1) { + continue; + } + + /* + * Only compare primary weights to determine multi-byte + * character equivalence + */ + if (cprim == tprim) { + s->equiv[p++] = i; + } + } + s->equiv[p] = OOBCH; + + if (!is_weight_cached) { + is_weight_cached = 1; + } + } + + s->cnt = 0; + s->state = SET; + s->set = s->equiv; +} + +static int +genrange(STR *s, int was_octal) +{ + int stopval, octal; + char *savestart; + int n, cnt, *p; + size_t clen; + wchar_t wc; + + octal = 0; + savestart = s->str; + if (*++s->str == '\\') + stopval = backslash(s, &octal); + else { + clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2) + errc(1, EILSEQ, NULL); + stopval = wc; + s->str += clen; + } + /* + * XXX Characters are not ordered according to collating sequence in + * multibyte locales. + */ + if (octal || was_octal || MB_CUR_MAX > 1) { + if (stopval < s->lastch) { + s->str = savestart; + return (0); + } + s->cnt = stopval - s->lastch + 1; + s->state = RANGE; + --s->lastch; + return (1); + } + if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { + s->str = savestart; + return (0); + } + if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) + err(1, "genrange() malloc"); + for (cnt = 0; cnt < NCHARS_SB; cnt++) + if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && + charcoll((const void *)&cnt, (const void *)&stopval) <= 0) + *p++ = cnt; + *p = OOBCH; + n = p - s->set; + + s->cnt = 0; + s->state = SET; + if (n > 1) + mergesort(s->set, n, sizeof(*(s->set)), charcoll); + return (1); +} + +static void +genseq(s) + STR *s; +{ + char *ep; + wchar_t wc; + size_t clen; + +#ifndef __APPLE__ + if (s->which == STRING1) + errx(1, "sequences only valid in string2"); +#endif /* !__APPLE__ */ + + if (*s->str == '\\') + s->lastch = backslash(s, NULL); + else { + clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2) + errc(1, EILSEQ, NULL); + s->lastch = wc; + s->str += clen; + } + if (*s->str != '*') + errx(1, "misplaced sequence asterisk"); + + switch (*++s->str) { + case '\\': + s->cnt = backslash(s, NULL); + break; + case ']': + s->cnt = 0; + ++s->str; + break; + default: + if (isdigit((u_char)*s->str)) { + s->cnt = strtol(s->str, &ep, 0); + if (*ep == ']') { + s->str = ep + 1; + break; + } + } + errx(1, "illegal sequence count"); + /* NOTREACHED */ + } + + s->state = s->cnt ? SEQUENCE : INFINITE; +} + +/* + * Translate \??? into a character. Up to 3 octal digits, if no digits either + * an escape code or a literal character. + */ +static int +backslash(STR *s, int *is_octal) +{ + int ch, cnt, val; + + if (is_octal != NULL) + *is_octal = 0; + for (cnt = val = 0;;) { + ch = (u_char)*++s->str; + if (!isdigit(ch) || ch > '7') + break; + val = val * 8 + ch - '0'; + if (++cnt == 3) { + ++s->str; + break; + } + } + if (cnt) { + if (is_octal != NULL) + *is_octal = 1; + return (val); + } + if (ch != '\0') + ++s->str; + switch (ch) { + case 'a': /* escape characters */ + return ('\7'); + case 'b': + return ('\b'); + case 'f': + return ('\f'); + case 'n': + return ('\n'); + case 'r': + return ('\r'); + case 't': + return ('\t'); + case 'v': + return ('\13'); + case '\0': /* \" -> \ */ + s->state = EOS; + return ('\\'); + default: /* \x" -> x */ + return (ch); + } +} diff --git a/text_cmds/tr/tr.1 b/text_cmds/tr/tr.1 new file mode 100644 index 0000000..6513674 --- /dev/null +++ b/text_cmds/tr/tr.1 @@ -0,0 +1,420 @@ +.\" Copyright (c) 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" This code is derived from software contributed to Berkeley by +.\" the Institute of Electrical and Electronics Engineers, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)tr.1 8.1 (Berkeley) 6/6/93 +.\" $FreeBSD: src/usr.bin/tr/tr.1,v 1.33 2005/02/13 23:45:51 ru Exp $ +.\" +.Dd July 23, 2004 +.Dt TR 1 +.Os +.Sh NAME +.Nm tr +.Nd translate characters +.Sh SYNOPSIS +.Nm +.Op Fl Ccsu +.Ar string1 string2 +.Nm +.Op Fl Ccu +.Fl d +.Ar string1 +.Nm +.Op Fl Ccu +.Fl s +.Ar string1 +.Nm +.Op Fl Ccu +.Fl ds +.Ar string1 string2 +.Sh DESCRIPTION +The +.Nm +utility copies the standard input to the standard output with substitution +or deletion of selected characters. +.Pp +The following options are available: +.Bl -tag -width Ds +.It Fl C +Complement the set of characters in +.Ar string1 , +that is +.Dq Fl C Li ab +includes every character except for +.Ql a +and +.Ql b . +.It Fl c +Same as +.Fl C +but complement the set of values in +.Ar string1 . +.It Fl d +Delete characters in +.Ar string1 +from the input. +.It Fl s +Squeeze multiple occurrences of the characters listed in the last +operand (either +.Ar string1 +or +.Ar string2 ) +in the input into a single instance of the character. +This occurs after all deletion and translation is completed. +.It Fl u +Guarantee that any output is unbuffered. +.El +.Pp +In the first synopsis form, the characters in +.Ar string1 +are translated into the characters in +.Ar string2 +where the first character in +.Ar string1 +is translated into the first character in +.Ar string2 +and so on. +If +.Ar string1 +is longer than +.Ar string2 , +the last character found in +.Ar string2 +is duplicated until +.Ar string1 +is exhausted. +.Pp +In the second synopsis form, the characters in +.Ar string1 +are deleted from the input. +.Pp +In the third synopsis form, the characters in +.Ar string1 +are compressed as described for the +.Fl s +option. +.Pp +In the fourth synopsis form, the characters in +.Ar string1 +are deleted from the input, and the characters in +.Ar string2 +are compressed as described for the +.Fl s +option. +.Pp +The following conventions can be used in +.Ar string1 +and +.Ar string2 +to specify sets of characters: +.Bl -tag -width [:equiv:] +.It character +Any character not described by one of the following conventions +represents itself. +.It \eoctal +A backslash followed by 1, 2 or 3 octal digits represents a character +with that encoded value. +To follow an octal sequence with a digit as a character, left zero-pad +the octal sequence to the full 3 octal digits. +.It \echaracter +A backslash followed by certain special characters maps to special +values. +.Pp +.Bl -column "\ea" +.It "\ea +.It "\eb +.It "\ef +.It "\en +.It "\er +.It "\et +.It "\ev +.El +.Pp +A backslash followed by any other character maps to that character. +.It c-c +For non-octal range endpoints +represents the range of characters between the range endpoints, inclusive, +in ascending order, +as defined by the collation sequence. +If either or both of the range endpoints are octal sequences, it +represents the range of specific coded values between the +range endpoints, inclusive. +.Pp +.Bf Em +See the +.Sx COMPATIBILITY +section below for an important note regarding +differences in the way the current +implementation interprets range expressions differently from +previous implementations. +.Ef +.It [:class:] +Represents all characters belonging to the defined character class. +Class names are: +.Pp +.Bl -column "phonogram" +.It "alnum +.It "alpha +.It "blank +.It "cntrl +.It "digit +.It "graph +.It "ideogram +.It "lower +.It "phonogram +.It "print +.It "punct +.It "rune +.It "space +.It "special +.It "upper +.It "xdigit +.El +.Pp +.\" All classes may be used in +.\" .Ar string1 , +.\" and in +.\" .Ar string2 +.\" when both the +.\" .Fl d +.\" and +.\" .Fl s +.\" options are specified. +.\" Otherwise, only the classes ``upper'' and ``lower'' may be used in +.\" .Ar string2 +.\" and then only when the corresponding class (``upper'' for ``lower'' +.\" and vice-versa) is specified in the same relative position in +.\" .Ar string1 . +.\" .Pp +When +.Dq Li [:lower:] +appears in +.Ar string1 +and +.Dq Li [:upper:] +appears in the same relative position in +.Ar string2 , +it represents the characters pairs from the +.Dv toupper +mapping in the +.Ev LC_CTYPE +category of the current locale. +When +.Dq Li [:upper:] +appears in +.Ar string1 +and +.Dq Li [:lower:] +appears in the same relative position in +.Ar string2 , +it represents the characters pairs from the +.Dv tolower +mapping in the +.Ev LC_CTYPE +category of the current locale. +.Pp +With the exception of case conversion, +characters in the classes are in unspecified order. +.Pp +For specific information as to which +.Tn ASCII +characters are included +in these classes, see +.Xr ctype 3 +and related manual pages. +.It [=equiv=] +Represents all characters belonging to the same equivalence class as +.Ar equiv , +ordered by their encoded values. +.It [#*n] +Represents +.Ar n +repeated occurrences of the character represented by +.Ar # . +This +expression is only valid when it occurs in +.Ar string2 . +If +.Ar n +is omitted or is zero, it is be interpreted as large enough to extend +.Ar string2 +sequence to the length of +.Ar string1 . +If +.Ar n +has a leading zero, it is interpreted as an octal value, otherwise, +it is interpreted as a decimal value. +.El +.Sh ENVIRONMENT +The +.Ev LANG , LC_ALL , LC_CTYPE +and +.Ev LC_COLLATE +environment variables affect the execution of +.Nm +as described in +.Xr environ 7 . +.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +The following examples are shown as given to the shell: +.Pp +Create a list of the words in file1, one per line, where a word is taken to +be a maximal string of letters. +.Pp +.D1 Li "tr -cs \*q[:alpha:]\*q \*q\en\*q < file1" +.Pp +Translate the contents of file1 to upper-case. +.Pp +.D1 Li "tr \*q[:lower:]\*q \*q[:upper:]\*q < file1" +.Pp +(This should be preferred over the traditional +.Ux +idiom of +.Dq Li "tr a-z A-Z" , +since it works correctly in all locales.) +.Pp +Strip out non-printable characters from file1. +.Pp +.D1 Li "tr -cd \*q[:print:]\*q < file1" +.Pp +Remove diacritical marks from all accented variants of the letter +.Ql e : +.Pp +.Dl "tr \*q[=e=]\*q \*qe\*q" +.Sh COMPATIBILITY +Previous +.Fx +implementations of +.Nm +did not order characters in range expressions according to the current +locale's collation order, making it possible to convert unaccented Latin +characters (esp.\& as found in English text) from upper to lower case using +the traditional +.Ux +idiom of +.Dq Li "tr A-Z a-z" . +Since +.Nm +now obeys the locale's collation order, this idiom may not produce +correct results when there is not a 1:1 mapping between lower and +upper case, or when the order of characters within the two cases differs. +As noted in the +.Sx EXAMPLES +section above, the character class expressions +.Dq Li [:lower:] +and +.Dq Li [:upper:] +should be used instead of explicit character ranges like +.Dq Li a-z +and +.Dq Li A-Z . +.Pp +System V has historically implemented character ranges using the syntax +.Dq Li [c-c] +instead of the +.Dq Li c-c +used by historic +.Bx +implementations and +standardized by POSIX. +System V shell scripts should work under this implementation as long as +the range is intended to map in another range, i.e., the command +.Dq Li "tr [a-z] [A-Z]" +will work as it will map the +.Ql \&[ +character in +.Ar string1 +to the +.Ql \&[ +character in +.Ar string2 . +However, if the shell script is deleting or squeezing characters as in +the command +.Dq Li "tr -d [a-z]" , +the characters +.Ql \&[ +and +.Ql \&] +will be +included in the deletion or compression list which would not have happened +under a historic System V implementation. +Additionally, any scripts that depended on the sequence +.Dq Li a-z +to +represent the three characters +.Ql a , +.Ql \- +and +.Ql z +will have to be +rewritten as +.Dq Li a\e-z . +.Pp +The +.Nm +utility has historically not permitted the manipulation of NUL bytes in +its input and, additionally, stripped NUL's from its input stream. +This implementation has removed this behavior as a bug. +.Pp +The +.Nm +utility has historically been extremely forgiving of syntax errors, +for example, the +.Fl c +and +.Fl s +options were ignored unless two strings were specified. +This implementation will not permit illegal syntax. +.Sh STANDARDS +The +.Nm +utility conforms to +.St -p1003.1-2001 . +.Pp +It should be noted that the feature wherein the last character of +.Ar string2 +is duplicated if +.Ar string2 +has less characters than +.Ar string1 +is permitted by POSIX but is not required. +Shell scripts attempting to be portable to other POSIX systems should use +the +.Dq Li [#*] +convention instead of relying on this behavior. +The +.Fl u +option is an extension to the +.St -p1003.1-2001 +standard. diff --git a/text_cmds/tr/tr.c b/text_cmds/tr/tr.c new file mode 100644 index 0000000..7bd2ca7 --- /dev/null +++ b/text_cmds/tr/tr.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +__FBSDID("$FreeBSD: src/usr.bin/tr/tr.c,v 1.24 2005/04/09 14:31:41 stefanf Exp $"); + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1988, 1993\n\ + The Regents of the University of California. All rights reserved.\n"; +#endif + +#ifndef lint +static const char sccsid[] = "@(#)tr.c 8.2 (Berkeley) 5/4/95"; +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cmap.h" +#include "cset.h" +#include "extern.h" + +STR s1 = { STRING1, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL }; +STR s2 = { STRING2, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL }; + +static struct cset *setup(char *, STR *, int, int); +static void usage(void); + +int +main(int argc, char **argv) +{ + static int carray[NCHARS_SB]; + struct cmap *map; + struct cset *delete, *squeeze; + int n, *p; + int Cflag, cflag, dflag, sflag, isstring2; + wint_t ch, cnt, lastch; + + (void)setlocale(LC_ALL, ""); + + Cflag = cflag = dflag = sflag = 0; + while ((ch = getopt(argc, argv, "Ccdsu")) != -1) + switch((char)ch) { + case 'C': + Cflag = 1; + cflag = 0; + break; + case 'c': + cflag = 1; + Cflag = 0; + break; + case 'd': + dflag = 1; + break; + case 's': + sflag = 1; + break; + case 'u': + setbuf(stdout, (char *)NULL); + break; + case '?': + default: + usage(); + } + argc -= optind; + argv += optind; + + switch(argc) { + case 0: + default: + usage(); + /* NOTREACHED */ + case 1: + isstring2 = 0; + if(!argv[0]) usage(); + break; + case 2: + isstring2 = 1; + if(!argv[0] || !argv[1]) usage(); + break; + } + + /* + * tr -ds [-Cc] string1 string2 + * Delete all characters (or complemented characters) in string1. + * Squeeze all characters in string2. + */ + if (dflag && sflag) { + if (!isstring2) + usage(); + + delete = setup(argv[0], &s1, cflag, Cflag); + squeeze = setup(argv[1], &s2, 0, 0); + + for (lastch = OOBCH; (ch = getwchar()) != WEOF;) + if (!cset_in(delete, ch) && + (lastch != ch || !cset_in(squeeze, ch))) { + lastch = ch; + (void)putwchar(ch); + } + if (ferror(stdin)) + err(1, NULL); + exit(0); + } + + /* + * tr -d [-Cc] string1 + * Delete all characters (or complemented characters) in string1. + */ + if (dflag) { + if (isstring2) + usage(); + + delete = setup(argv[0], &s1, cflag, Cflag); + + while ((ch = getwchar()) != WEOF) + if (!cset_in(delete, ch)) + (void)putwchar(ch); + if (ferror(stdin)) + err(1, NULL); + exit(0); + } + + /* + * tr -s [-Cc] string1 + * Squeeze all characters (or complemented characters) in string1. + */ + if (sflag && !isstring2) { + squeeze = setup(argv[0], &s1, cflag, Cflag); + + for (lastch = OOBCH; (ch = getwchar()) != WEOF;) + if (lastch != ch || !cset_in(squeeze, ch)) { + lastch = ch; + (void)putwchar(ch); + } + if (ferror(stdin)) + err(1, NULL); + exit(0); + } + + /* + * tr [-Ccs] string1 string2 + * Replace all characters (or complemented characters) in string1 with + * the character in the same position in string2. If the -s option is + * specified, squeeze all the characters in string2. + */ + if (!isstring2) + usage(); + + map = cmap_alloc(); + if (map == NULL) + err(1, NULL); + squeeze = cset_alloc(); + if (squeeze == NULL) + err(1, NULL); + + s1.str = argv[0]; + + if (Cflag || cflag) { + cmap_default(map, OOBCH); + if ((s2.str = strdup(argv[1])) == NULL) + errx(1, "strdup(argv[1])"); + } else + s2.str = argv[1]; + + if (!next(&s2)) + errx(1, "empty string2"); + + /* + * For -s result will contain only those characters defined + * as the second characters in each of the toupper or tolower + * pairs. + */ + + /* If string2 runs out of characters, use the last one specified. */ + while (next(&s1)) { + again: + if (s1.state == CCLASS_LOWER && + s2.state == CCLASS_UPPER && + s1.cnt == 1 && s2.cnt == 1) { + do { + ch = towupper(s1.lastch); + cmap_add(map, s1.lastch, ch); + if (sflag && iswupper(ch)) + cset_add(squeeze, ch); + if (!next(&s1)) + goto endloop; + } while (s1.state == CCLASS_LOWER && s1.cnt > 1); + /* skip upper set */ + do { + if (!next(&s2)) + break; + } while (s2.state == CCLASS_UPPER && s2.cnt > 1); + goto again; + } else if (s1.state == CCLASS_UPPER && + s2.state == CCLASS_LOWER && + s1.cnt == 1 && s2.cnt == 1) { + do { + ch = towlower(s1.lastch); + cmap_add(map, s1.lastch, ch); + if (sflag && iswlower(ch)) + cset_add(squeeze, ch); + if (!next(&s1)) + goto endloop; + } while (s1.state == CCLASS_UPPER && s1.cnt > 1); + /* skip lower set */ + do { + if (!next(&s2)) + break; + } while (s2.state == CCLASS_LOWER && s2.cnt > 1); + goto again; + } else { + cmap_add(map, s1.lastch, s2.lastch); + if (sflag) + cset_add(squeeze, s2.lastch); + } + (void)next(&s2); + } +endloop: + if (cflag || (Cflag && MB_CUR_MAX > 1)) { + /* + * This is somewhat tricky: since the character set is + * potentially huge, we need to avoid allocating a map + * entry for every character. Our strategy is to set the + * default mapping to the last character of string #2 + * (= the one that gets automatically repeated), then to + * add back identity mappings for characters that should + * remain unchanged. We don't waste space on identity mappings + * for non-characters with the -C option; those are simulated + * in the I/O loop. + */ + s2.str = argv[1]; + s2.state = NORMAL; + for (cnt = 0; cnt < WCHAR_MAX; cnt++) { + if (Cflag && !iswrune(cnt)) + continue; + if (cmap_lookup(map, cnt) == OOBCH) { + if (next(&s2)) + cmap_add(map, cnt, s2.lastch); + if (sflag) + cset_add(squeeze, s2.lastch); + } else + cmap_add(map, cnt, cnt); + if ((s2.state == EOS || s2.state == INFINITE) && + cnt >= cmap_max(map)) + break; + } + cmap_default(map, s2.lastch); + } else if (Cflag) { + for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) { + if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt)) + *p++ = cnt; + else + cmap_add(map, cnt, cnt); + } + n = p - carray; + if (Cflag && n > 1) + (void)mergesort(carray, n, sizeof(*carray), charcoll); + + s2.str = argv[1]; + s2.state = NORMAL; + for (cnt = 0; cnt < n; cnt++) { + (void)next(&s2); + cmap_add(map, carray[cnt], s2.lastch); + /* + * Chars taken from s2 can be different this time + * due to lack of complex upper/lower processing, + * so fill string2 again to not miss some. + */ + if (sflag) + cset_add(squeeze, s2.lastch); + } + } + + cset_cache(squeeze); + cmap_cache(map); + + if (sflag) + for (lastch = OOBCH; (ch = getwchar()) != WEOF;) { + if (!Cflag || iswrune(ch)) + ch = cmap_lookup(map, ch); + if (lastch != ch || !cset_in(squeeze, ch)) { + lastch = ch; + (void)putwchar(ch); + } + } + else + while ((ch = getwchar()) != WEOF) { + if (!Cflag || iswrune(ch)) + ch = cmap_lookup(map, ch); + (void)putwchar(ch); + } + if (ferror(stdin)) + err(1, NULL); + exit (0); +} + +static struct cset * +setup(char *arg, STR *str, int cflag, int Cflag) +{ + struct cset *cs; + + cs = cset_alloc(); + if (cs == NULL) + err(1, NULL); + str->str = arg; + while (next(str)) + cset_add(cs, str->lastch); + if (Cflag) + cset_addclass(cs, wctype("rune"), true); + if (cflag || Cflag) + cset_invert(cs); + cset_cache(cs); + return (cs); +} + +int +charcoll(const void *a, const void *b) +{ + static char sa[2], sb[2]; + + sa[0] = *(const int *)a; + sb[0] = *(const int *)b; + return (strcoll(sa, sb)); +} + +static void +usage(void) +{ + (void)fprintf(stderr, "%s\n%s\n%s\n%s\n", + "usage: tr [-Ccsu] string1 string2", + " tr [-Ccu] -d string1", + " tr [-Ccu] -s string1", + " tr [-Ccu] -ds string1 string2"); + exit(1); +} -- cgit v1.2.3-56-ge451