Nigel Tao | b0c7286 | 2018-10-21 16:03:47 +1100 | [diff] [blame^] | 1 | // Copyright 2018 The Wuffs Authors. |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // https://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | // +build ignore |
| 16 | |
| 17 | package main |
| 18 | |
| 19 | // print-lzw-example.go prints a worked example discussed in std/lzw/README.md, |
| 20 | // based on three implementations of a simplified core of the LZW algorithm. |
| 21 | // Simplifications include assuming LSB-first, 8 bit literals, no clear codes, |
| 22 | // no full tables and that the input is given in terms of a code stream, not a |
| 23 | // bit stream. |
| 24 | // |
| 25 | // The implementations are designed for ease of studying, and for e.g. a |
| 26 | // minimal diff between the suf1 and sufQ implementations, and are not |
| 27 | // optimized for performance. |
| 28 | // |
| 29 | // Usage: go run print-lzw-example.go |
| 30 | // |
| 31 | // You can also play with the code (e.g. modify Q and re-run) online: |
| 32 | // https://play.golang.org/p/1aLC_XHZzoA |
| 33 | |
| 34 | import ( |
| 35 | "fmt" |
| 36 | ) |
| 37 | |
| 38 | const ( |
| 39 | expectedOutput = "TOBEORNOTTOBEORTOBEORNOTXOTXOTXOOTXOOOTXOOOTOBEY" |
| 40 | |
| 41 | clearCode = 0x100 |
| 42 | endCode = 0x101 |
| 43 | |
| 44 | // Neither clearCode or endCode can ever be a valid prefix code. We |
| 45 | // arbitrarily pick clearCode to represent "no prefix". |
| 46 | noPrefix = clearCode |
| 47 | |
| 48 | // Q is the maximum possible suffix length, in bytes, which equals 1 plus |
| 49 | // the maximum possible "suffix length minus 1". Q == 1 means that sufQ is |
| 50 | // equivalent to suf1. |
| 51 | // |
| 52 | // In std/lzw/README.md, Q is 3 to keep the example short and simple. In |
| 53 | // practice, especially on modern CPUs that can do unaligned 32 or 64 bit |
| 54 | // loads and stores, Q is 4 or 8. |
| 55 | Q = 3 |
| 56 | ) |
| 57 | |
| 58 | var codes = []int{ |
| 59 | 'T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', |
| 60 | 0x102, 0x104, 0x106, 0x10B, 0x105, 0x107, 0x109, |
| 61 | 'X', |
| 62 | 0x111, 0x113, 0x114, 0x115, 0x10E, |
| 63 | 'Y', |
| 64 | 0x101, |
| 65 | } |
| 66 | |
| 67 | func codeString(code int) string { |
| 68 | if code < clearCode { |
| 69 | return fmt.Sprintf("%5c", code) |
| 70 | } else if code == noPrefix { |
| 71 | return " -" |
| 72 | } |
| 73 | return fmt.Sprintf("0x%03X", code) |
| 74 | } |
| 75 | |
| 76 | func main() { |
| 77 | n, s, q := new(naive), new(suf1), new(sufQ) |
| 78 | decode(n) |
| 79 | decode(s) |
| 80 | decode(q) |
| 81 | |
| 82 | key := endCode |
| 83 | fmt.Printf(" Code Emits Key Value Pre1+Suf1 LM1 /Q %%Q PreQ+SufQ\n") |
| 84 | for _, code := range codes { |
| 85 | // Code |
| 86 | fmt.Printf("%s", codeString(code)) |
| 87 | |
| 88 | // Emits |
| 89 | if code == endCode { |
| 90 | fmt.Printf(" -end-\n") |
| 91 | break |
| 92 | } |
| 93 | fmt.Printf(" %6s", n[code]) |
| 94 | |
| 95 | if key != endCode { |
| 96 | fmt.Printf(" 0x%03X %7s %s %c %3d %3d %3d %s %s", |
| 97 | key, // Key |
| 98 | n[key], // Value |
| 99 | codeString(int(s.prefixes[key])), // Pre1 |
| 100 | s.suffixes[key], // Suf1 |
| 101 | q.lengthM1s[key], // LM1 |
| 102 | q.lengthM1s[key]/Q, // /Q |
| 103 | q.lengthM1s[key]%Q, // %Q |
| 104 | codeString(int(q.prefixes[key])), // PreQ |
| 105 | string(q.suffixes[key][:]), // SufQ |
| 106 | ) |
| 107 | } |
| 108 | |
| 109 | fmt.Println() |
| 110 | key++ |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | type implementation interface { |
| 115 | initialize() |
| 116 | emit(buffer []byte, code int, prevCode int, n int) []byte |
| 117 | } |
| 118 | |
| 119 | func decode(impl implementation) { |
| 120 | impl.initialize() |
| 121 | |
| 122 | output, prevCode, n := []byte(nil), -1, 0x101 |
| 123 | for _, code := range codes { |
| 124 | if code == clearCode { |
| 125 | panic("unimplemented clearCode") |
| 126 | } else if code == endCode { |
| 127 | if string(output) != expectedOutput { |
| 128 | panic("unexpected output") |
| 129 | } |
| 130 | return |
| 131 | } else if code > n { |
| 132 | panic("invalid code") |
| 133 | } |
| 134 | |
| 135 | // We have a literal or copy code. |
| 136 | output = impl.emit(output, code, prevCode, n) |
| 137 | |
| 138 | prevCode, n = code, n+1 |
| 139 | if n >= 4096 { |
| 140 | panic("unimplemented table-is-full") |
| 141 | } |
| 142 | } |
| 143 | panic("missing endCode") |
| 144 | } |
| 145 | |
| 146 | // naive is a simple implementation that, in the worst case, requires O(N*N) |
| 147 | // memory. |
| 148 | type naive [4096]string |
| 149 | |
| 150 | func (t *naive) initialize() { |
| 151 | for i := 0; i < clearCode; i++ { |
| 152 | t[i] = string([]byte{byte(i)}) |
| 153 | } |
| 154 | } |
| 155 | |
| 156 | func (t *naive) emit(output []byte, code int, prevCode int, n int) []byte { |
| 157 | if prevCode >= 0 { |
| 158 | prevOutput := t[prevCode] |
| 159 | if code < n { |
| 160 | t[n] = prevOutput + t[code][:1] |
| 161 | } else { |
| 162 | t[n] = prevOutput + prevOutput[:1] |
| 163 | } |
| 164 | } |
| 165 | return append(output, t[code]...) |
| 166 | } |
| 167 | |
| 168 | // suf1 keeps separate prefix and suffix tables, 1 byte per suffix. |
| 169 | type suf1 struct { |
| 170 | prefixes [4096]uint16 |
| 171 | suffixes [4096]byte |
| 172 | } |
| 173 | |
| 174 | func (t *suf1) initialize() { |
| 175 | for i := 0; i < clearCode; i++ { |
| 176 | t.prefixes[i] = noPrefix |
| 177 | t.suffixes[i] = byte(i) |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | func (t *suf1) emit(output []byte, code int, prevCode int, n int) []byte { |
| 182 | // Fill an intermediate buffer from back to front, 1 byte at a time. |
| 183 | buffer := [4096]byte{} |
| 184 | i := len(buffer) |
| 185 | |
| 186 | c := code |
| 187 | if code == n { |
| 188 | c = prevCode |
| 189 | i-- |
| 190 | // buffer[4095] will be set later. |
| 191 | } |
| 192 | |
| 193 | firstByte := byte(0) |
| 194 | for { |
| 195 | suffix := t.suffixes[c] |
| 196 | i-- |
| 197 | buffer[i] = suffix |
| 198 | c = int(t.prefixes[c]) |
| 199 | if c == noPrefix { |
| 200 | firstByte = suffix |
| 201 | break |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | if code == n { |
| 206 | buffer[4095] = firstByte |
| 207 | } |
| 208 | |
| 209 | // The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of |
| 210 | // 0, is correct conceptually, but unnecessary in practice. Look for "fake |
| 211 | // key-value pair" in std/lzw/README.md. |
| 212 | if prevCode >= 0 { |
| 213 | t.prefixes[n] = uint16(prevCode) |
| 214 | t.suffixes[n] = firstByte |
| 215 | } |
| 216 | |
| 217 | return append(output, buffer[i:4096]...) |
| 218 | } |
| 219 | |
| 220 | // sufQ keeps separate prefix and suffix tables, up to Q bytes per suffix. |
| 221 | type sufQ struct { |
| 222 | lengthM1s [4096]uint16 |
| 223 | prefixes [4096]uint16 |
| 224 | suffixes [4096][Q]byte |
| 225 | } |
| 226 | |
| 227 | func (t *sufQ) initialize() { |
| 228 | for i := 0; i < clearCode; i++ { |
| 229 | t.lengthM1s[i] = 0 |
| 230 | t.prefixes[i] = noPrefix |
| 231 | t.suffixes[i][0] = byte(i) |
| 232 | for j := 1; j < Q; j++ { |
| 233 | // Unnecessary for correct output, but makes the printed table prettier. |
| 234 | t.suffixes[i][j] = '.' |
| 235 | } |
| 236 | } |
| 237 | } |
| 238 | |
| 239 | func (t *sufQ) emit(output []byte, code int, prevCode int, n int) []byte { |
| 240 | // Fill an intermediate buffer from back to front, Q bytes at a time. |
| 241 | buffer := [4096 + Q - 1]byte{} |
| 242 | i := len(buffer) |
| 243 | |
| 244 | c := code |
| 245 | if code == n { |
| 246 | c = prevCode |
| 247 | i-- |
| 248 | // buffer[4095] will be set later. |
| 249 | } |
| 250 | |
| 251 | i -= int(t.lengthM1s[c] % Q) |
| 252 | |
| 253 | firstByte := byte(0) |
| 254 | for { |
| 255 | suffix := t.suffixes[c] |
| 256 | i -= Q |
| 257 | copy(buffer[i:i+Q], suffix[:]) |
| 258 | c = int(t.prefixes[c]) |
| 259 | if c == noPrefix { |
| 260 | firstByte = suffix[0] |
| 261 | break |
| 262 | } |
| 263 | } |
| 264 | |
| 265 | if code == n { |
| 266 | buffer[4095] = firstByte |
| 267 | } |
| 268 | |
| 269 | // The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of |
| 270 | // 0, is correct conceptually, but unnecessary in practice. Look for "fake |
| 271 | // key-value pair" in std/lzw/README.md. |
| 272 | if prevCode >= 0 { |
| 273 | lm1 := t.lengthM1s[prevCode] + 1 |
| 274 | t.lengthM1s[n] = lm1 |
| 275 | lm1 %= Q |
| 276 | if lm1 != 0 { |
| 277 | // Copy the prevCode's prefix and suffix, and then extend the suffix. |
| 278 | t.prefixes[n] = t.prefixes[prevCode] |
| 279 | t.suffixes[n] = t.suffixes[prevCode] |
| 280 | t.suffixes[n][lm1] = firstByte |
| 281 | } else { |
| 282 | // Start a new suffix. |
| 283 | t.prefixes[n] = uint16(prevCode) |
| 284 | t.suffixes[n][0] = firstByte |
| 285 | for j := 1; j < Q; j++ { |
| 286 | // Unnecessary for correct output, but makes the printed table prettier. |
| 287 | t.suffixes[n][j] = '.' |
| 288 | } |
| 289 | } |
| 290 | } |
| 291 | |
| 292 | return append(output, buffer[i:4096]...) |
| 293 | } |