The wc program that I want to implement just counts number of lines (as in number of 0x0a characters) and words (as in, number of non-space characters that follow space characters). It might be not the best program to try optimizations on, but it's what's grabbed my fancy.
An obvious start to me is to avoid expensive routines for reading a single character, like fgetc(). We should read in larger chunks - fread() to a buffer of 16 or 64 KB should be ok.
How to optimize further means pushing my personal limits of understanding - my vague idea is that we want to avoid stalls and conditional execution using lookup tables, and somehow want to exploit data-level and instruction-level parallelism, whatever that means exactly. I thought I'd just drop a little code that I wrote here. The optimization with the precomputed lookup[256] table below brought the user-time down from ~1.05sec to ~0.85 sec in my test. So, it's at least measurable, but not thrilling given that it adds a lot of code.
I hope that someone experienced can comment on it / show how to make it go even faster.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | #include <assert.h> #include <stdio.h> #include <stdlib.h> #include <stdint.h> static const unsigned char spacemap[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, }; static const unsigned char nlmap[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, }; #define ISSP(x) (spacemap[x]) #define ISNL(x) (nlmap[x]) static unsigned char lookup[256]; static unsigned char buf[16*1024]; static void precompute(void) { for (int i = 0; i < 256; i++) { lookup[i] = 0; int wassp = 0; for (int j = 0; j < 8; j++) { int issp = (i >> j) & 1; if (wassp && !issp) lookup[i]++; wassp = issp; } } } int main(int argc, const char **argv) { precompute(); if (argc != 2) { fprintf(stderr, "Usage: ./test FILE\n"); exit(1); } const char *filepath = argv[1]; FILE *f = fopen(filepath, "rb"); if (f == NULL) { fprintf(stderr, "Failed to open file '%s'\n", filepath); exit(1); } uint64_t numBytes = 0; uint64_t numWords = 0; uint64_t numLines = 0; uint64_t wasSp = 1; for (;;) { size_t ret = fread(buf, 1, sizeof buf, f); if (ret == 0) break; numBytes += ret; if (ret != sizeof buf) { for (size_t i = 0; i < ret; i++) { unsigned char c = buf[i]; if (wasSp) numWords += !ISSP(c); numLines += ISNL(c); wasSp = ISSP(c); } break; } for (size_t i = 0; i < sizeof buf / 8; i++) { uint64_t x = ((uint64_t *) buf)[i]; #if 1 uint64_t a0 = (x >> 0) & 0xff; uint64_t a1 = (x >> 8) & 0xff; uint64_t a2 = (x >> 16) & 0xff; uint64_t a3 = (x >> 24) & 0xff; uint64_t a4 = (x >> 32) & 0xff; uint64_t a5 = (x >> 40) & 0xff; uint64_t a6 = (x >> 48) & 0xff; uint64_t a7 = (x >> 56) & 0xff; numLines += ISNL(a0); numLines += ISNL(a1); numLines += ISNL(a2); numLines += ISNL(a3); numLines += ISNL(a4); numLines += ISNL(a5); numLines += ISNL(a6); numLines += ISNL(a7); if (wasSp) numWords += !ISSP(a0); wasSp = ISSP(a7); uint64_t spaces = (ISSP(a0) << 0) | (ISSP(a1) << 1) | (ISSP(a2) << 2) | (ISSP(a3) << 3) | (ISSP(a4) << 4) | (ISSP(a5) << 5) | (ISSP(a6) << 6) | (ISSP(a7) << 7); numWords += lookup[spaces]; #else if (wasSp) numWords += !ISSP(x&0xff); wasSp = ISSP((x>>56)&0xff); uint64_t spaces = 0; for (int j = 0; j < 8; j++) { spaces |= ISSP(x & 0xff) << j; numLines += ISNL(x & 0xff); x >>= 8; } numWords += lookup[spaces]; #endif } } if (ferror(f)) { fprintf(stderr, "I/O error when reading '%s'\n", filepath); exit(1); } assert(feof(f)); fprintf(stderr, "lines, words, bytes: %zu, %zu, %zu\n", numLines, numWords, numBytes); fclose(f); return 0; } |