[Toybox] [patch] add sed

Strake strake888 at gmail.com
Wed Jul 31 19:07:45 PDT 2013


Not yet fully functional, but has s/// and some others.

>From ded73ac6c00754ec9715ede475b328a48561ad86 Mon Sep 17 00:00:00 2001
From: Strake <strake888 at gmail.com>
Date: Wed, 31 Jul 2013 20:41:31 -0500
Subject: add sed

---
 lib/lib.h          |   7 +-
 lib/pending.c      |  24 ++
 toys/pending/sed.c | 629 ++++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 555 insertions(+), 105 deletions(-)

diff --git a/lib/lib.h b/lib/lib.h
index 98f4aad..a6b185e 100644
--- a/lib/lib.h
+++ b/lib/lib.h
@@ -107,6 +107,7 @@ int xopen(char *path, int flags);
 void xclose(int fd);
 int xdup(int fd);
 FILE *xfopen(char *path, char *mode);
+FILE *xfmemopen(void *, size_t, char *);
 size_t xread(int fd, void *buf, size_t len);
 void xreadall(int fd, void *buf, size_t len);
 void xwrite(int fd, void *buf, size_t len);
@@ -199,5 +200,7 @@ char* make_human_readable(unsigned long long size, unsigned long unit);
 unsigned long get_int_value(const char *numstr, unsigned lowrange, unsigned highrange);

 // grep helper functions
-char  *astrcat (char *, char *);
-char *xastrcat (char *, char *);
+char   *astrcat  (char *, char *);
+char  *xastrcat  (char *, char *);
+char  *astrncat0 (char *, char *, size_t);
+char *xastrncat0 (char *, char *, size_t);
diff --git a/lib/pending.c b/lib/pending.c
index fad1c65..099fee4 100644
--- a/lib/pending.c
+++ b/lib/pending.c
@@ -102,3 +102,27 @@ char *xastrcat (char *x, char *y) {
   if (!x) error_exit ("xastrcat");
   return x;
 }
+
+/* Append at most n bytes of y to the heap string x (x may be 0) and return
+ * the resulting NUL-terminated string, or 0 if the reallocation fails. */
+char *astrncat0 (char *x, char *y, size_t n) {
+  size_t have = x ? strlen (x) : 0;
+  char *stop = memchr (y, 0, n);
+  size_t take = stop ? (size_t)(stop - y) : n;
+  char *grown = realloc (x, have + n + 1);
+
+  if (!grown) return 0;
+  memcpy (grown + have, y, take);
+  grown[have + take] = 0;
+  return grown;
+}
+
+/* astrncat0 that exits with an error instead of returning 0. */
+char *xastrncat0 (char *x, char *y, size_t n) {
+  char *joined = astrncat0 (x, y, n);
+
+  if (!joined) error_exit ("xastrncat");
+  return joined;
+}
+
+/* fmemopen that exits with an error message when the stream cannot be made. */
+FILE *xfmemopen (void *x, size_t n, char *mode) {
+  FILE *stream = fmemopen (x, n, mode);
+
+  if (!stream) perror_exit ("xfmemopen");
+  return stream;
+}
diff --git a/toys/pending/sed.c b/toys/pending/sed.c
index 0ce25ac..e0787e4 100644
--- a/toys/pending/sed.c
+++ b/toys/pending/sed.c
@@ -1,138 +1,561 @@
-/* sed.c - Stream editor.
+/* sed.c - stream editor
  *
- * Copyright 2012 Rob Landley <rob at landley.net>
+ * Copyright 2013 CE Strake <strake888 at gmail.com>
  *
- * See http://opengroup.org/onlinepubs/9699919799/utilities/sed.c
+ * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/
+ * See http://refspecs.linuxfoundation.org/LSB_4.1.0/LSB-Core-generic/LSB-Core-generic/cmdbehav.html

-USE_SED(NEWTOY(sed, "irne*f*", TOYFLAG_BIN))
+USE_SED(NEWTOY(sed, "Ene*f*", TOYFLAG_BIN))

 config SED
   bool "sed"
   default n
   help
-    usage: sed [-irn] {command | [-e command]...} [FILE...]
-
-    Stream EDitor, transforms text by appling script of command to each line
-    of input.
-
-    -e  Add expression to the command script (if no -e, use first argument)
-    -i	Modify file in place
-    -n  No default output (p commands only)
-    -r  Use extended regular expression syntex
+    usage: sed ...
 */

 #define FOR_sed
 #include "toys.h"
 #include "lib/xregcomp.h"
+#include <regex.h>

 GLOBALS(
-  struct arg_list *files;
-  struct arg_list *scripts;
-
-  void *commands;
+  struct arg_list *fArgu, *eArgu; /* script sources: buildScript opens fArgu entries as files and parses eArgu entries as script text */
+  char *patternSpace, *holdSpace; /* the line being edited, and the buffer used by the g G h H x functions */
+  long n; /* count of input lines read so far; printed by the = function */
 )

-// Digested version of what sed commands can actually tell use to do.
+/* Read one wide character from f, exiting with an error at end of input or
+ * on a read failure. wint_t may be an unsigned type, so the failure value
+ * has to be recognised by comparing with WEOF, not by testing for < 0. */
+static wint_t xfgetwc (FILE *f) {
+  wint_t x = fgetwc (f);
+  if (x == WEOF) error_exit ("failed to read");
+  return x;
+}
+
+/* Read one byte from f; exit with an error if none can be read. */
+static int xfgetc (FILE *f) {
+  int byte = fgetc (f);
+
+  if (byte == EOF) error_exit ("failed to read");
+  return byte;
+}
+
+/* Return the next wide character of f without consuming it (WEOF if none). */
+static wint_t fpeekwc (FILE *f) {
+  wint_t next = fgetwc (f);
+
+  /* pushing back WEOF would leave the stream unchanged anyway */
+  if (next != WEOF) ungetwc (next, f);
+  return next;
+}
+
+/* Return the next byte of f without consuming it (EOF if there is none). */
+static int fpeekc (FILE *f) {
+  int next = fgetc (f);
+
+  /* pushing back EOF would leave the stream unchanged anyway */
+  if (next != EOF) ungetc (next, f);
+  return next;
+}
+
+/* Consume wide characters from f for as long as predicate p accepts them;
+ * the first rejected character is pushed back. */
+static void skipSuch (FILE *f, int (*p) (wint_t)) {
+  wint_t x = fgetwc (f);
+
+  while (p (x)) x = fgetwc (f);
+  ungetwc (x, f);
+}
+
+/* Nonzero for white space other than newline. */
+static int iswnspace (wint_t x) {
+  if (x == '\n') return 0;
+  return iswspace (x) != 0;
+}
+
+/* Read wide characters from f up to, and not including, the delimiter d and
+ * return them as a newly allocated NUL-terminated wide string.
+ * If e is nonzero it is an escape character: e followed by d yields a
+ * literal d; e followed by anything else is kept and the following character
+ * is read again as usual.
+ * An empty field (d met at once) yields an empty string. 0 is returned only
+ * when end of input is reached before anything was read, or directly after
+ * an escape character. The caller frees the result. */
+static wchar_t *afgetwswde (FILE *f, wint_t d, wint_t e) {
+  wint_t x;          /* wint_t so that WEOF can be told from real characters */
+  wchar_t *xs;
+  size_t n = 0;
+
+  xs = 0;
+  for (;;) {
+    x = fgetwc (f);
+    if (x == WEOF) {
+      if (!n) return 0;   /* nothing read: plain end of input */
+      break;
+    }
+    if (x == d) break;
+
+    if (e && x == e) {
+      x = fgetwc (f);
+      if (x == WEOF) {
+        free (xs);
+        return 0;
+      }
+      if (x != d) {
+        ungetwc (x, f);
+        x = e;
+      }
+    }

+    xs = xrealloc (xs, sizeof (wchar_t)*(++n + 1));
+    xs[n - 1] = x;
+  }
+
+  /* an empty field is still a valid, empty, string */
+  if (!xs) xs = xrealloc (xs, sizeof (wchar_t));
+  xs[n] = 0;
+  return xs;
+}

-struct sed_command {
-  // double_list compatibility (easier to create in-order)
-  struct sed_command *next, *prev;
+/* Convert the wide string xs into a newly allocated multibyte string.
+ * wcstombs reports an unconvertible character as (size_t)-1; adding 1 to
+ * that would wrap to a zero-byte buffer, so exit with an error instead.
+ * The caller frees the result. */
+static char *wcstoambs (wchar_t *xs) {
+  char *cs;
+  size_t n;
+
+  n = wcstombs (0, xs, 0);
+  if (n == (size_t)-1) error_exit ("bad multibyte conversion");
+  cs = xmalloc (sizeof (char)*(n + 1));
+  wcstombs (cs, xs, n + 1);
+  return cs;
+}

-  // data string for (saicytb)
-  char c, *data;
-  // Regexes for s/match/data/ and /begin/,/end/command
-  regex_t *rmatch, *rbegin, *rend;
-  // For numeric ranges ala 10,20command
-  long lstart, lstop;
-  // Which match to replace, 0 for all. s and w commands can write to a file
-  int which, outfd;
+/* Multibyte counterpart of afgetwswde: read a field from f delimited by d,
+ * with escape character e, and return it as a newly allocated char string,
+ * or 0 when afgetwswde returns 0. */
+static char *afgetswde (FILE *f, wint_t d, wint_t e) {
+  wchar_t *wide = afgetwswde (f, d, e);
+  char *narrow;
+
+  if (!wide) return 0;
+  narrow = wcstoambs (wide);
+  free (wide);
+  return narrow;
+}
+
+/* afgetswde that exits with an error message instead of returning 0. */
+static char *xafgetswde (FILE *f, wint_t d, wint_t e) {
+  char *field = afgetswde (f, d, e);
+
+  if (!field) perror_exit ("failed to read");
+  return field;
+}
+
+typedef struct {
+  char type; /* set by parseAddress: '/' for a regex, '$', 'n' for a line number, 0 for no address */
+  union {
+    regex_t re; /* compiled pattern, when type is '/' */
+    long n; /* line number, when type is 'n' */
+  };
+} Address;
+
+typedef struct Function {
+  char x; /* function letter; parseFunction stores ';' for a '{' group, and 0 marks the end of an array */
+  union {
+    struct {
+      char *writ; /* text of a, c, i; replacement of s */
+      regex_t re; /* compiled pattern of s */
+      int flags, fd; /* flags: sed_*Flag bits of s; fd: file opened for r, w, or the w flag of s */
+      long n, nMatch /* for s function, to pass to regexec */;
+    };
+    struct Function *fns; /* functions of a '{' group, as returned by parseFunctions */
+  };
+} Function;
+
+typedef struct {
+  Address s, t; /* first address and, after a ',', second address (type 0 when absent) */
+  Function fn; /* function run on the lines inRange accepts */
+  int active; /* range state used by inRange; parseCommand starts it at 0 */
+} Command;
+
+enum {
+  sed_gFlag = 1, /* g flag of the s function */
+  sed_pFlag = 2, /* p flag of the s function */
+  sed_wFlag = 4, /* w flag of the s function */
 };

-//  Space. Space. Gotta get past space. Spaaaaaaaace! (But not newline.)
-static void spaceorb(char **s)
-{
-  while (**s == ' ' || **s == '\t') ++*s;
-}
-
-// Parse sed commands
-
-static void parse_scripts(void)
-{
-  struct arg_list *script;
-  int which = 0, i;
-
-  // Loop through list of scripts collated from command line and/or files
-
-  for (script = TT.scripts; script; script = script->next) {
-    char *str = script->arg;
-    struct sed_command *cmd;
-
-    // we can get multiple commands from a string (semicolons and such)
-
-    which++;
-    for (i=1;;) {
-      if (!*str) break;
-
-      cmd = xzalloc(sizeof(struct sed_command));
-
-      // Identify prefix
-      for (;;) {
-        spaceorb(&str);
-        if (*str == '^') {
-          if (cmd->lstart) goto parse_fail;
-          cmd->lstart = -1;
-          str++;
-          continue;
-        } else if (*str == '$') {
-          cmd->lstop = LONG_MAX;
-          str++;
-          break;
-        } else if (isdigit(*str)) {
-          long ll = strtol(str, &str, 10);
-
-          if (ll<0) goto parse_fail;
-          if (cmd->lstart) {
-            cmd->lstop = ll;
-            break;
-          } else cmd->lstart = ll;
-        } else if (*str == '/' || *str == '\\') {
-          // set begin/end
-          printf("regex\n");
-          exit(1);
-        } else if (!cmd->lstart && !cmd->rbegin) break;
-        else goto parse_fail;  // , with no range after it
-
-        spaceorb(&str);
-        if (*str != ',') break;
-        str++;
+static Command *cs; /* the assembled script: an array ended by an entry whose fn.x is 0 */
+
+static long      parseNumber    (FILE *);
+static Address   parseAddress   (FILE *);
+static Function  parseFunction  (FILE *);
+static Function *parseFunctions (FILE *); /* called by parseFunction for '{', and calls it back */
+static Command   parseCommand   (FILE *);
+
+/* Read a run of decimal digits from f and return its value (0 if there are
+ * no digits). The first non-digit is pushed back. */
+static long parseNumber (FILE *f) {
+  long value = 0;
+  int ch;
+
+  while (isdigit (ch = fgetc (f))) value = value*10 + (ch - '0');
+  ungetc (ch, f);
+  return value;
+}
+
+/* Parse one address from f: /regex/, \cregexc with a custom delimiter c,
+ * $, or a line number. If none of these is next, nothing is consumed and
+ * the returned address has type 0. */
+static Address parseAddress (FILE *f) {
+  Address addr;
+  wint_t first = fgetwc (f);
+
+  if (first == '\\' || first == '/') {
+    wint_t delim = first;
+    char *pattern;
+
+    if (first == '\\') {
+      delim = xfgetwc (f);
+      if (delim == '\\' || delim == 0) error_exit ("bad delimiter");
+    }
+    addr.type = '/';
+    pattern = xafgetswde (f, delim, '\\');
+    xregcomp (&addr.re, pattern, toys.optflags & FLAG_E ? REG_EXTENDED : 0);
+    free (pattern);
+  }
+  else if (first == '$') addr.type = '$';
+  else {
+    ungetwc (first, f);
+    if (!iswdigit (first)) addr.type = 0;
+    else {
+      addr.type = 'n';
+      addr.n = parseNumber (f);
+    }
+  }
+
+  return addr;
+}
+
+/* Parse one function from f: the function letter and whatever arguments
+ * that letter takes.
+ *   {        parses the group with parseFunctions and consumes the closing }
+ *   a c i    skip the rest of the line, then read the text line into writ
+ *   r w      read a file name and open it into fd
+ *   s        read delimiter, pattern, replacement and the g/p/w flags
+ * Any other letter is returned with no arguments parsed. */
+static Function parseFunction (FILE *f) {
+  Function fn;
+  char *xs;
+  wint_t d;
+  int c;
+
+  /* start from all zeroes: the s flags below are OR-ed into fn.flags */
+  memset (&fn, 0, sizeof (fn));
+
+  fn.x = fgetc (f);
+  switch (fn.x) {
+  case '{':
+    fn.x = ';';
+    fn.fns = parseFunctions (f);
+    /* consume the brace, so that parsing resumes after the group */
+    if (fgetc (f) != '}') error_exit ("{} mismatch");
+    break;
+  case 'a':
+  case 'c':
+  case 'i':
+    /* skip the rest of this line; stop at end of input as well */
+    do c = fgetc (f); while (c != EOF && c != '\n');
+    fn.writ = xafgetswde (f, '\n', '\\');
+    break;
+  case 'r':
+  case 'w':
+    skipSuch (f, iswnspace);
+    xs = xafgetswde (f, '\n', '\\');
+    fn.fd = xopen (xs, fn.x == 'r' ? O_RDONLY : (O_WRONLY | O_CREAT | O_TRUNC));
+    free (xs);
+    break;
+  case 's':
+    d = xfgetwc (f);
+    xs      = xafgetswde (f, d, '\\');
+    fn.writ = xafgetswde (f, d, '\\');
+    xregcomp (&fn.re, xs, toys.optflags & FLAG_E ? REG_EXTENDED : 0);
+    fn.nMatch = 256; /* TO DO: unbreak */
+    free (xs);
+    for (;;) {
+      /* c is an int: as a char, EOF could not be told from a data byte */
+      c = fgetc (f);
+      if (c == EOF) break;
+      if (isspace (c) && c != '\n') continue;
+      else switch (c) {
+      case '\n':
+      case ';':
+        goto s_end;
+      case 'g':
+        fn.flags |= sed_gFlag;
+        break;
+      case 'p':
+        fn.flags |= sed_pFlag;
+        break;
+      case 'w':
+        fn.flags |= sed_wFlag;
+        skipSuch (f, iswnspace);
+        xs = xafgetswde (f, '\n', '\\');
+        /* O_APPEND alone opens read-only; the file has to be writable */
+        fn.fd = xopen (xs, O_WRONLY | O_CREAT | O_APPEND);
+        free (xs);
+        goto s_end;
+      default:
+        error_exit ("bad flag: %c", c);
       }
-      i = stridx("{bcdDgGhHlnNpPstwxyrqia= \t#:}", *str);
-      if (i == -1) goto parse_fail;
+    }
+s_end:
+    break;
+  }
+
+  return fn;
+}
+
+/* Parse one command from f: an optional address, an optional second address
+ * after a comma, then the function. A comma requires both addresses. */
+static Command parseCommand (FILE *f) {
+  Command cmd;
+
+  skipSuch (f, iswnspace);
+  cmd.s = parseAddress (f);
+
+  if (fpeekc (f) != ',') cmd.t = (Address){ .type = 0 };
+  else {
+    fgetc (f);
+    cmd.t = parseAddress (f);
+    if (!cmd.s.type || !cmd.t.type) error_exit ("bad address");
+  }
+
+  skipSuch (f, iswnspace);
+  cmd.fn = parseFunction (f);
+  cmd.active = 0;
+
+  return cmd;
+}
+
+/* Parse functions from f until end of input or a '}' (which is left unread)
+ * and return them as a newly allocated array ended by an entry whose x is 0. */
+static Function *parseFunctions (FILE *f) {
+  Function *fns;
+  long n = 0;
+
+  fns = 0;
+  for (;;) {
+    int c;
+
+    skipSuch (f, iswspace);
+
+    fns = xrealloc (fns, sizeof (Function)*(++n + 1));

-      dlist_add_nomalloc((struct double_list **)&TT.commands,
-                         (struct double_list *)cmd);
-      exit(1);
+    c = fpeekc (f);
+    if (c == EOF || c == '}') break;
+
+    fns[n - 1] = parseFunction (f);
+  }
+
+  /* n was already bumped for the slot that never got filled, so the
+   * terminator goes into that slot rather than the one after it */
+  fns[n - 1] = (Function){ .x = 0 };
+
+  return fns;
+}
+
+/* Parse commands from f until end of input or a '}' (which is left unread)
+ * and return them as a newly allocated array ended by an entry whose fn.x
+ * is 0. The terminator directly follows the last parsed command. */
+static Command *parseCommands (FILE *f) {
+  Command *cs = 0;
+  long n = 0;
+
+  for (;;) {
+    int c;
+
+    skipSuch (f, iswspace);
+
+    c = fpeekc (f);
+    if (c == EOF || c == '}') break;
+
+    /* room for this command plus the terminator */
+    cs = xrealloc (cs, sizeof (Command)*(n + 2));
+    cs[n++] = parseCommand (f);
+  }
+
+  /* also allocates the array when no command was parsed at all */
+  cs = xrealloc (cs, sizeof (Command)*(n + 1));
+  cs[n] = (Command){ .fn = (Function) { .x = 0 } };
+
+  return cs;
+}
+
+/* Return nonzero if command c applies to the current line (TT.n, held in
+ * TT.patternSpace).
+ * NOTE(review): c is passed by value, so the changes made to c.active below
+ * do not reach the stored command; ranges that involve a regex address
+ * therefore still cannot remember that they have started. */
+static int inRange (Command c) {
+  if (!c.t.type) switch (c.s.type) {
+    case 'n':
+      return (c.s.n == TT.n);
+    case '/':
+      /* a lone address is stored in c.s; c.t holds no regex here */
+      return (regexec (&c.s.re, TT.patternSpace, 0, 0, 0) == 0);
+    case 0:
+      return 1;
+  }
+  /* A range between two line numbers needs no remembered state. The first
+   * line always matches, even when the second number is smaller. */
+  if (c.s.type == 'n' && c.t.type == 'n')
+    return (TT.n == c.s.n || (c.s.n <= TT.n && TT.n <= c.t.n));
+  if (c.active) {
+    switch (c.t.type) {
+    case 'n':
+      /* the range is over once the current line reaches the end line */
+      if (TT.n >= c.t.n) c.active = 0;
+      break;
+    case '/':
+      if (regexec (&c.t.re, TT.patternSpace, 0, 0, 0) == 0) c.active = 0;
+      break;
+    }
+    return 1;
+  }
+  else {
+    switch (c.s.type) {
+    case 'n':
+      if (c.s.n <= TT.n) c.active = 1;
+      break;
+    case '/':
+      if (regexec (&c.s.re, TT.patternSpace, 0, 0, 0) == 0) c.active = 1;
+      break;
     }
+    return c.active;
   }
+}
+
+/* Apply the s function fn to TT.patternSpace, replacing the first match, or
+ * every match when the g flag is set. In the replacement, & stands for the
+ * whole match, \N (decimal) for subexpression N, and \ followed by any other
+ * character for that character. TT.patternSpace is replaced by the result. */
+static void sFn (Function fn) {
+  char *xs, *ys, *p;
+  regmatch_t *ms;
+  long n, ii;
+
+  ms = xmalloc (sizeof (regmatch_t)*fn.nMatch);
+  ys = 0;
+  xs = TT.patternSpace;
+  do {
+    /* past the first match xs no longer points at the start of the line */
+    if (regexec (&fn.re, xs, fn.nMatch, ms, xs == TT.patternSpace ? 0 : REG_NOTBOL)) break;
+
+    ys = xastrncat0 (ys, xs, ms[0].rm_so);
+    for (ii = 0; fn.writ[ii]; ii++) {
+      char ch = fn.writ[ii];
+
+      if (ch == '&') {
+        ys = xastrncat0 (ys, xs + ms[0].rm_so, ms[0].rm_eo - ms[0].rm_so);
+        continue;
+      }
+      /* a backslash at the very end is kept as an ordinary character */
+      if (ch == '\\' && fn.writ[ii + 1]) {
+        ch = fn.writ[++ii];
+        /* only hand real digits to strtoul: it would also take a sign */
+        if (isdigit ((unsigned char)ch)) {
+          n = strtoul (fn.writ + ii, &p, 10);
+          ii = p - fn.writ - 1;
+          if (n < 0 || n >= fn.nMatch || ms[n].rm_so < 0) error_msg ("bad backreference: %ld", n);
+          else ys = xastrncat0 (ys, xs + ms[n].rm_so, ms[n].rm_eo - ms[n].rm_so);
+          continue;
+        }
+      }
+      ys = xastrcat (ys, (char []){ ch, 0 });
+    }
+
+    xs += ms[0].rm_eo;
+    if (ms[0].rm_so == ms[0].rm_eo) {
+      /* empty match: copy one byte and step over it, or stop at the end of
+       * the line, so that the same empty match is not found forever */
+      if (!*xs) break;
+      ys = xastrncat0 (ys, xs, 1);
+      xs++;
+    }
+  } while (fn.flags & sed_gFlag);
+
+  ys = xastrcat (ys, xs);
+  free (ms);
+  free (TT.patternSpace);
+  TT.patternSpace = ys;
+}

-  return;
+/* Run command c on the current pattern space if its addresses select the
+ * current line. f is the input stream, read by the n and N functions.
+ * Returns nonzero when the remaining commands of this cycle are to be
+ * skipped (d, D), and 0 otherwise, including when c does not apply. */
+static int doCommand (FILE *f, Command c) {
+  char *xs;
+  size_t len = 0;
+
+  if (!inRange (c)) return 0;
+
+  switch (c.fn.x) {
+  case 'c':
+    TT.patternSpace[0] = 0;
+    /* fall thru */
+  case 'a':
+  case 'i':
+    /* the text was read without its newline, so supply one */
+    printf ("%s\n", c.fn.writ);
+    return 0;
+  case 'D':
+    xs = strchr (TT.patternSpace, '\n');
+    if (xs) {
+      memmove (TT.patternSpace, xs + 1, strlen (xs + 1) + 1);
+      return 1;
+    }
+  /* fall thru */
+  case 'd':
+    TT.patternSpace[0] = 0;
+    return 1;
+  case 'g':
+    free (TT.patternSpace);
+    TT.patternSpace = xstrdup (TT.holdSpace);
+    return 0;
+  case 'G':
+    /* xastrcat may move the buffer: always keep the pointer it returns */
+    TT.patternSpace = xastrcat (TT.patternSpace, "\n");
+    TT.patternSpace = xastrcat (TT.patternSpace, TT.holdSpace);
+    return 0;
+  case 'h':
+    free (TT.holdSpace);
+    TT.holdSpace = xstrdup (TT.patternSpace);
+    return 0;
+  case 'H':
+    TT.holdSpace = xastrcat (TT.holdSpace, "\n");
+    TT.holdSpace = xastrcat (TT.holdSpace, TT.patternSpace);
+    return 0;
+  case 'n':
+    if (!(toys.optflags & FLAG_n)) printf ("%s\n", TT.patternSpace);
+    free (TT.patternSpace);
+    TT.patternSpace = 0;
+    if (getline (&TT.patternSpace, &len, f) < 0) xexit ();
+    /* getline keeps the newline; the pattern space is held without one */
+    xs = strchr (TT.patternSpace, '\n');
+    if (xs) *xs = 0;
+    TT.n++;
+    return 0;
+  case 'N':
+    xs = 0;
+    if (getline (&xs, &len, f) < 0) xexit ();
+    if (strchr (xs, '\n')) *strchr (xs, '\n') = 0;
+    TT.patternSpace = xastrcat (TT.patternSpace, "\n");
+    TT.patternSpace = xastrcat (TT.patternSpace, xs);
+    free (xs);
+    TT.n++;
+    return 0;
+  case 'p':
+    printf ("%s\n", TT.patternSpace);
+    return 0;
+  case 'q':
+    /* quitting still prints the pattern space, unless -n was given */
+    if (!(toys.optflags & FLAG_n)) printf ("%s\n", TT.patternSpace);
+    xexit ();
+  case 's':
+    sFn (c.fn);
+    return 0;
+  case 'x':
+    xs = TT.patternSpace;
+    TT.patternSpace = TT.holdSpace;
+    TT.holdSpace = xs;
+    return 0;
+  case '=':
+    printf ("%ld\n", TT.n);
+    return 0;
+  default:
+    error_exit ("%c function unimplete", c.fn.x);
+  }
+
+  return 0;
+}

-parse_fail:
-  error_exit("bad expression %d@%d: %s", which, i, script->arg+i);
+/* Run the i commands of script cs, in order, stopping early if doCommand
+ * returns nonzero. */
+static void doPreCommands (FILE *f, Command *cs) {
+  long ii;
+
+  for (ii = 0; cs[ii].fn.x; ii++) {
+    if (cs[ii].fn.x != 'i') continue;
+    if (doCommand (f, cs[ii])) break;
+  }
 }

-void sed_main(void)
-{
-  char **files=toys.optargs;
+/* Run every command of script cs except the i commands, in order, stopping
+ * early if doCommand returns nonzero. */
+static void doCommands (FILE *f, Command *cs) {
+  long ii;
+
+  for (ii = 0; cs[ii].fn.x; ii++) {
+    if (cs[ii].fn.x == 'i') continue;
+    if (doCommand (f, cs[ii])) break;
+  }
+}

-  // If no -e, use first argument
-  if (!TT.scripts) {
-    if (!*files) error_exit("Need script");
-    (TT.scripts = xzalloc(sizeof(struct arg_list)))->arg = *(files++);
+/* loopfiles callback: run the script cs over every line of the file open on
+ * fd. name is used only in the error message.
+ * Each line is read first and the commands run afterwards, so that the i
+ * commands see a valid pattern space and are not run once more at end of
+ * input. The pattern space is printed after each cycle unless -n was given. */
+void do_sed (int fd, char *name) {
+  FILE *f;
+
+  f = fdopen (fd, "r");
+  if (!f) perror_exit ("failed to open %s", name);
+
+  free (TT.patternSpace);
+  TT.patternSpace = 0;
+  for (;;) {
+    TT.patternSpace = afgetswde (f, '\n', 0);
+    if (!TT.patternSpace) return;
+    TT.n++;
+    doPreCommands (f, cs);
+    doCommands (f, cs);
+    if (!(toys.optflags & FLAG_n)) printf ("%s\n", TT.patternSpace);
+    free (TT.patternSpace);
+    TT.patternSpace = 0;   /* never leave a dangling pointer behind */
   }
+}
+
+/* Append the commands of ds (an array ended by an entry whose fn.x is 0) to
+ * the script cs, terminator included. ds itself is left to the caller.
+ * While cs is still unset there is nothing to scan; a zero-byte allocation
+ * must not be searched for a terminator. */
+void addCommands (Command *ds) {
+  long m = 0, n = 0;
+
+  if (cs) while (cs[m].fn.x) m++;
+  while (ds[n].fn.x) n++;
+  cs = xrealloc (cs, sizeof (Command)*(m + n + 1));
+  memmove (cs + m, ds, sizeof (Command)*(n + 1));
+}
+
+/* Parse all commands from f and append them to the script. A '}' left
+ * unread by the parser has no matching '{' and is an error. */
+void faddCommands (FILE *f) {
+  Command *parsed = parseCommands (f);
+
+  if (fpeekc (f) == '}') error_exit ("{} mismatch");
+  addCommands (parsed);
+  free (parsed);
+}

-  parse_scripts();
+/* Assemble the script cs: first every TT.eArgu entry, parsed as script
+ * text, then every TT.fArgu entry, opened as a script file. If neither gave
+ * anything, the first remaining argument is taken as the script and removed
+ * from the argument list. Both lists are consumed in the process. */
+void buildScript (void) {
+  FILE *src;
+
+  cs = 0;
+
+  while (TT.eArgu) {
+    src = xfmemopen (TT.eArgu -> arg, strlen (TT.eArgu -> arg), "r");
+    faddCommands (src);
+    fclose (src);
+    TT.eArgu = TT.eArgu -> next;
+  }
+  while (TT.fArgu) {
+    src = xfopen (TT.fArgu -> arg, "r");
+    faddCommands (src);
+    fclose (src);
+    TT.fArgu = TT.fArgu -> next;
+  }
+
+  if (cs) return;
+
+  if (toys.optc < 1) error_exit ("no script");
+  src = xfmemopen (toys.optargs[0], strlen (toys.optargs[0]), "r");
+  faddCommands (src);
+  fclose (src);
+  toys.optc--;
+  toys.optargs++;
+}

-  while (*files) dprintf(2,"file=%s\n", *(files++));
+/* Entry point: build the script, then run it over every input file.
+ * Both spaces start out as empty strings; a zero-byte allocation has no
+ * terminator and must not be handed to the string functions used on them. */
+void sed_main (void) {
+  buildScript ();
+
+  TT.patternSpace = xstrdup ("");
+  TT.holdSpace    = xstrdup ("");
+
+  loopfiles (toys.optargs, do_sed);
 }
-- 
1.7.11.1


More information about the Toybox mailing list