[Toybox] New tests for dirname and wc

Felix Janda felix.janda at posteo.de
Sun Nov 4 15:04:13 PST 2012


On 11/01/12 at 09:49am, Rob Landley wrote:
> I note that adding utf-8 support to wc might be an interesting small  
> project. It's basically mbrtowc() and possibly with wcswidth() on the  
> result. (I'd have to check the definition of -m to see if they want  
> characters output or character positions output).
> 
> If not, I should get around to it before too long. :)

wc -m only cares about counting characters. Attached is an attempt at
implementing it, along with some test cases. The test cases only run in
UTF-8 locales.

I think that a config option for internationalization support should be added.

> I'm interested in defining what those extensions are, but it's really  
> data collection. I know that I use <(command) and >(command), the  
> {curly,bracket} stuff, and pipefail. Several other things are synonyms:  
> $[1+2] is more or less $((1+2)), saying "function" before a function  
> definition is a NOP...
>
> Aboriginal linux is building bash 2.05b, because last I checked busybox  
> ash couldn't build LFS. (This may have changed, I haven't rechecked in  
> a while.) But most packages I tried didn't need the bash stuff  
> introduced in 3.x or 4.x. Then again, I know this version of bash is  
> too old to run gentoo's portage package manager (which uses newer bash  
> features: some quoting rule changed, and it uses the ~= regex thing).  
> At one point I patched portage to work with older bash, but that's  
> pretty stale.
> 
> I'd like to get toysh to run portage, the aboriginal linux build, and  
> make it through linux from scratch (what are they up to, 7.2? I've got  
> an automated 6.8 build that needs updating...)

LFS is at 7.2. Now with udev from systemd.

Ok, thanks for the elaboration. Do you recall which parts of LFS required
bash extensions? Now someone just needs to figure out which bash features
portage uses.

Felix
-------------- next part --------------
diff -r 17692bd604a2 toys/posix/wc.c
--- a/toys/posix/wc.c	Sun Nov 04 16:42:03 2012 +0100
+++ b/toys/posix/wc.c	Sun Nov 04 23:58:50 2012 +0100
@@ -6,22 +6,24 @@
  *
  * See http://opengroup.org/onlinepubs/9699919799/utilities/wc.html
 
-USE_WC(NEWTOY(wc, "cwl", TOYFLAG_USR|TOYFLAG_BIN))
+USE_WC(NEWTOY(wc, "mcwl", TOYFLAG_USR|TOYFLAG_BIN))
 
 config WC
 	bool "wc"
 	default y
 	help
-	  usage: wc -lwc [FILE...]
+	  usage: wc -lwcm [FILE...]
 
 	  Count lines, words, and characters in input.
 
 	  -l	show lines
 	  -w	show words
-	  -c	show characters
+	  -c	show bytes
+	  -m	show characters
 
-	  By default outputs lines, words, characters, and filename for each
-	  argument (or from stdin if none).
+	  By default outputs lines, words, bytes, and filename for each
+	  argument (or from stdin if none). Displays only either bytes
+	  or characters.
 */
 
 #include "toys.h"
@@ -48,7 +50,8 @@
 
 static void do_wc(int fd, char *name)
 {
-	int i, len;
+	int i, len, clen=1, space;
+	wchar_t wchar;
 	unsigned long word=0, lengths[]={0,0,0};
 
 	for (;;) {
@@ -58,9 +61,24 @@
 			toys.exitval = EXIT_FAILURE;
 		}
 		if (len<1) break;
-		for (i=0; i<len; i++) {
+		for (i=0; i<len; i+=clen) {
+			if(toys.optflags&8) {
+				clen = mbrtowc(&wchar, toybuf+i, len-i, 0);
+				if(clen==(size_t)(-1)) {
+					if(i!=len-1) {
+						clen = 1;
+						continue;
+					}
+					else break;
+				}
+				if(clen==(size_t)(-2)) break;
+				if(clen==0) clen=1;
+				space = iswspace(wchar);
+			}
+			else space = isspace(toybuf[i]);
+
 			if (toybuf[i]==10) lengths[0]++;
-			if (isspace(toybuf[i])) word=0;
+			if (space) word=0;
 			else {
 				if (!word) lengths[1]++;
 				word=1;
@@ -74,6 +92,8 @@
 
 void wc_main(void)
 {
+	setlocale(LC_ALL, "");
+	toys.optflags |= (toys.optflags&8)>>1;
 	loopfiles(toys.optargs, do_wc);
 	if (toys.optc>1) show_lengths(TT.totals, "total");
 }
diff -r 17692bd604a2 scripts/test/wc.test
--- a/scripts/test/wc.test	Sun Nov 04 16:42:03 2012 +0100
+++ b/scripts/test/wc.test	Sun Nov 04 23:58:57 2012 +0100
@@ -18,5 +18,29 @@
 testing "wc -l" "wc -l file1" "4 file1\n" "" ""
 testing "wc -w" "wc -w file1" "5 file1\n" "" ""
 testing "wc format" "wc file1" "4 5 26 file1\n" "" ""
-testing "wc multiple files" "wc input - file1" "1 2 3 input\n0 2 3 -\n4 5 26 file1\n5 9 32 total\n" "a\nb" "a b"
+testing "wc multiple files" "wc input - file1" \
+        "1 2 3 input\n0 2 3 -\n4 5 26 file1\n5 9 32 total\n" "a\nb" "a b"
+
+#Tests for wc -m
+if printf "%s" "$LANG" | grep -q UTF-8
+then
+
+printf " " > file1
+for i in $(seq 1 8192)
+do
+  printf "?" >> file1
+done
+testing "wc -m" "wc -m file1" "8193 file1\n" "" ""
+printf " " > file1
+for i in $(seq 1 8192)
+do
+  printf "??" >> file1
+done
+testing "wc -m (invalid chars)" "wc -m file1" "8193 file1\n" "" ""
+testing "wc -mlw" "wc -mlw input" "1 2 11 input\n" "hello, ??!\n" ""
+
+else
+printf "skipping tests for wc -m"
+fi
+
 rm file1
diff -r 17692bd604a2 toys.h
--- a/toys.h	Sun Nov 04 16:42:03 2012 +0100
+++ b/toys.h	Sun Nov 04 23:59:04 2012 +0100
@@ -16,6 +16,7 @@
 #include <inttypes.h>
 #include <limits.h>
 #include <libgen.h>
+#include <locale.h>
 #include <math.h>
 #include <pty.h>
 #include <pwd.h>
@@ -46,6 +47,8 @@
 #include <unistd.h>
 #include <utime.h>
 #include <utmpx.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "lib/lib.h"
 #include "toys/e2fs.h"


More information about the Toybox mailing list