[Toybox] [patch] add sed

Rob Landley rob at landley.net
Sun Aug 4 23:23:09 PDT 2013


On 07/31/2013 09:07:45 PM, Strake wrote:
> Not yet fully functional, but has s/// and some others.

My first major contribution to busybox was a complete rewrite of its  
sed implementation, making it work in all the configure and build  
scripts of linux from scratch and even getting weird corner bug-for-bug  
gnuism cases working like files that don't end in a newline not having  
a newline added _unless_ you have more lines from another input source  
where you retroactively output a newline before the next line you  
output but _don't_ output that newline if the next output source  
doesn't actually have any matches that produce output. (And then there's  
the fun "what does match end of file mean when you have more than one  
input on the command line"...)

I put a partial version in pending to indicate that I've got this one,  
I just haven't prioritized it because I just got back to Austin on  
Thursday and I've been spending all my time cleaning up contributions  
instead of writing new code. Somewhere I have a file of notes from the  
last time I wrote sed with all the questions I had to answer, I need to  
make a test suite out of that...

> -USE_SED(NEWTOY(sed, "irne*f*", TOYFLAG_BIN))
> +USE_SED(NEWTOY(sed, "Ene*f*", TOYFLAG_BIN))
> 
>  config SED
>    bool "sed"
>    default n
>    help
> -    usage: sed [-irn] {command | [-e command]...} [FILE...]
> -
> -    Stream EDitor, transforms text by appling script of command to  
> each line
> -    of input.
> -
> -    -e  Add expression to the command script (if no -e, use first  
> argument)
> -    -i	Modify file in place
> -    -n  No default output (p commands only)
> -    -r  Use extended regular expression syntex
> +    usage: sed ...
>  */

I often write help text before I even write the commands. Help text is  
important.

>  #define FOR_sed
>  #include "toys.h"
>  #include "lib/xregcomp.h"
> +#include <regex.h>

I originally had xregcomp because uClibc let you configure out regex  
support, but A) I never made the build selectively do only parts of  
lib/*.c based on configuration so this doesn't let you build in more  
environments yet, although I could trivially slap an #ifdef around the  
contents of the file, B) posix requires these regex functions, and  
these days I'm leaning towards just having it in toys.h and demanding a  
posix build environment.

The reason I _haven't_ done this yet is I need to look at bionic and  
see what that's got, because being able to build a subset of toybox  
commands against bionic is interesting, and I know that's got holes but  
don't remember what they are. (I need to set up a bionic dev  
environment when I finish switching to my new laptop.)

>  GLOBALS(
> -  struct arg_list *files;
> -  struct arg_list *scripts;
> -
> -  void *commands;
> +  struct arg_list *fArgu, *eArgu;

I tend to leave command line targets one per line, and then have a  
blank line between them and other globals. That way you can immediately  
see what the command line arguments are, and it's easy to adjust them  
if you adjust the option string.

> +  char *patternSpace, *holdSpace;
> +  long n;
>  )
> 
> -// Digested version of what sed commands can actually tell use to do.
> +static wint_t xfgetwc (FILE *f) {
> +  wint_t x = fgetwc (f);
> +  if (x < 0) error_exit ("failed to read");
> +  return x;
> +}

Some sort of wide character support, but not based on TOYBOX_I18N...

> +static int xfgetc (FILE *f) {
> +  int c = fgetc (f);
> +  if (c < 0) error_exit ("failed to read");
> +  return c;
> +}
> +
> +static wint_t fpeekwc (FILE *f) {
> +  wint_t x = fgetwc (f);
> +  ungetwc (x, f);
> +  return x;
> +}
> +
> +static int fpeekc (FILE *f) {
> +  int c = fgetc (f);
> +  ungetc (c, f);
> +  return c;
> +}
> +
> +static void skipSuch (FILE *f, int (*p) (wint_t)) {
> +  wint_t x;
> +  do x = fgetwc (f); while (p (x));
> +  ungetwc (x, f);
> +}
> +
> +static int iswnspace (wint_t x) { return (iswspace (x) && x !=  
> '\n'); }
> +
> +static wchar_t *afgetwswde (FILE *f, wint_t d, wint_t e) {
> +  wchar_t x, *xs;
> +  size_t n = 0;
> +
> +  xs = 0;
> +  for (;;) {
> +    x = fgetwc (f);
> +    if (x < 0 || x == d) break;
> +
> +    if (e && x == e) {
> +      x = fgetwc (f);
> +      if (x < 0) {
> +        if (xs) free (xs);
> +        return 0;
> +      }
> +      if (x != d) {
> +        ungetwc (x, f);
> +        x = e;
> +      }
> +    }
> 
> +    xs = xrealloc (xs, sizeof (wchar_t)*(++n + 1));
> +    xs[n - 1] = x;
> +  }
> +
> +  if (xs) xs[n] = 0;
> +  return xs;
> +}
> 
> -struct sed_command {
> -  // double_list compatibility (easier to create in-order)
> -  struct sed_command *next, *prev;
> +static char *wcstoambs (wchar_t *xs) {
> +  char *cs;
> +  size_t n;
> +  n = wcstombs (0, xs, 0);
> +  cs = xmalloc (sizeof (char)*(n + 1));
> +  wcstombs (cs, xs, n + 1);
> +  return cs;
> +}
> 
> -  // data string for (saicytb)
> -  char c, *data;
> -  // Regexes for s/match/data/ and /begin/,/end/command
> -  regex_t *rmatch, *rbegin, *rend;
> -  // For numeric ranges ala 10,20command
> -  long lstart, lstop;
> -  // Which match to replace, 0 for all. s and w commands can write  
> to a file
> -  int which, outfd;
> +static char *afgetswde (FILE *f, wint_t d, wint_t e) {
> +  wchar_t *xs;
> +  char *cs;
> +  cs = 0;
> +  xs = afgetwswde (f, d, e);
> +  if (xs) {
> +    cs = wcstoambs (xs);
> +    free (xs);
> +  }
> +  return cs;
> +}
> +
> +static char *xafgetswde (FILE *f, wint_t d, wint_t e) {
> +  char *cs;
> +  cs = afgetswde (f, d, e);
> +  if (!cs) perror_exit ("failed to read");
> +  return cs;
> +}

So xafgetswde() calls afgetswde() which calls afgetwswde() and  
wcstoambs(). Yeah, that didn't need a comment. And the non-wide  
character version is implemented as a wrapper around the wide character  
version.

> +typedef struct {
> +  char type;
> +  union {
> +    regex_t re;
> +    long n;
> +  };
> +} Address;
> +
> +typedef struct Function {
> +  char x;
> +  union {
> +    struct {
> +      char *writ;
> +      regex_t re;
> +      int flags, fd;
> +      long n, nMatch /* for s function, to pass to regexec */;
> +    };
> +    struct Function *fns;
> +  };
> +} Function;

So this is a named struct _and_ a typedef?

> +typedef struct {
> +  Address s, t;
> +  Function fn;
> +  int active;
> +} Command;

I haven't got any typedefs in the code. When it's a struct I say struct.

> +enum {
> +  sed_gFlag = 1,
> +  sed_pFlag = 2,
> +  sed_wFlag = 4,
>  };

I don't currently have a single enum in the cleaned up codebase:

   grep -w enum *.[hc] toys/{posix,lsb,other}/*.[ch]

When I need a constant, I #define it. I try fairly hard not to need  
them.

> -//  Space. Space. Gotta get past space. Spaaaaaaaace! (But not  
> newline.)
> -static void spaceorb(char **s)
> -{
> -  while (**s == ' ' || **s == '\t') ++*s;
> -}
> -
> -// Parse sed commands
> -
> -static void parse_scripts(void)
> -{
> -  struct arg_list *script;
> -  int which = 0, i;
> -
> -  // Loop through list of scripts collated from command line and/or  
> files
> -
> -  for (script = TT.scripts; script; script = script->next) {
> -    char *str = script->arg;
> -    struct sed_command *cmd;
> -
> -    // we can get multiple commands from a string (semicolons and  
> such)
> -
> -    which++;
> -    for (i=1;;) {
> -      if (!*str) break;
> -
> -      cmd = xzalloc(sizeof(struct sed_command));
> -
> -      // Identify prefix
> -      for (;;) {
> -        spaceorb(&str);
> -        if (*str == '^') {
> -          if (cmd->lstart) goto parse_fail;
> -          cmd->lstart = -1;
> -          str++;
> -          continue;
> -        } else if (*str == '$') {
> -          cmd->lstop = LONG_MAX;
> -          str++;
> -          break;
> -        } else if (isdigit(*str)) {
> -          long ll = strtol(str, &str, 10);
> -
> -          if (ll<0) goto parse_fail;
> -          if (cmd->lstart) {
> -            cmd->lstop = ll;
> -            break;
> -          } else cmd->lstart = ll;
> -        } else if (*str == '/' || *str == '\\') {
> -          // set begin/end
> -          printf("regex\n");
> -          exit(1);
> -        } else if (!cmd->lstart && !cmd->rbegin) break;
> -        else goto parse_fail;  // , with no range after it
> -
> -        spaceorb(&str);
> -        if (*str != ',') break;
> -        str++;

Not extending the code that was there, just zapping it all and  
replacing it with something completely unrelated.

> +static Command *cs;
> +
> +static long      parseNumber    (FILE *);
> +static Address   parseAddress   (FILE *);
> +static Function  parseFunction  (FILE *);
> +static Function *parseFunctions (FILE *);
> +static Command   parseCommand   (FILE *);

Globals outside GLOBALS()

> +/* parse natural number */
> +static long parseNumber (FILE *f) {
> +  long n = 0;
> +  int c;
> +
> +  for (;;) {
> +    c = fgetc (f);
> +    if (!isdigit (c)) {
> +      ungetc (c, f);
> +      return n;
> +    }
> +    n *= 10;
> +    n += c - '0';
> +  }
> +}

Sed reads a line of data into a buffer and operates on it. The posix  
"getline()" can do this, and in posix 2008 can even allocate the buffer  
for you (although the size returned is rounded up to the buffer it  
allocated which is generally more than it actually read so it sucks  
handling embedded NUL bytes; pondering what to do about that). In a  
buffer, you can use strtoul() which tells you where the number ended.

(You can't implement "pattern space" and "hold space" without reading  
the whole line into a buffer. Multiple commands match the same line so  
you process it repeatedly, etc.)

Lemme catch up on my email and get back to this...

Rob
 1375683789.0


More information about the Toybox mailing list