blob: ab09d6a5b589691241ff81c7edd500062b3541cd [file] [log] [blame]
Nigel Taob0c72862018-10-21 16:03:47 +11001// Copyright 2018 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// +build ignore
16
17package main
18
19// print-lzw-example.go prints a worked example discussed in std/lzw/README.md,
20// based on three implementations of a simplified core of the LZW algorithm.
21// Simplifications include assuming LSB-first, 8 bit literals, no clear codes,
22// no full tables and that the input is given in terms of a code stream, not a
23// bit stream.
24//
25// The implementations are designed for ease of studying, and for e.g. a
26// minimal diff between the suf1 and sufQ implementations, and are not
27// optimized for performance.
28//
29// Usage: go run print-lzw-example.go
30//
31// You can also play with the code (e.g. modify Q and re-run) online:
32// https://play.golang.org/p/1aLC_XHZzoA
33
34import (
35 "fmt"
36)
37
38const (
39 expectedOutput = "TOBEORNOTTOBEORTOBEORNOTXOTXOTXOOTXOOOTXOOOTOBEY"
40
41 clearCode = 0x100
42 endCode = 0x101
43
44 // Neither clearCode or endCode can ever be a valid prefix code. We
45 // arbitrarily pick clearCode to represent "no prefix".
46 noPrefix = clearCode
47
48 // Q is the maximum possible suffix length, in bytes, which equals 1 plus
49 // the maximum possible "suffix length minus 1". Q == 1 means that sufQ is
50 // equivalent to suf1.
51 //
52 // In std/lzw/README.md, Q is 3 to keep the example short and simple. In
53 // practice, especially on modern CPUs that can do unaligned 32 or 64 bit
54 // loads and stores, Q is 4 or 8.
55 Q = 3
56)
57
58var codes = []int{
59 'T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T',
60 0x102, 0x104, 0x106, 0x10B, 0x105, 0x107, 0x109,
61 'X',
62 0x111, 0x113, 0x114, 0x115, 0x10E,
63 'Y',
64 0x101,
65}
66
67func codeString(code int) string {
68 if code < clearCode {
69 return fmt.Sprintf("%5c", code)
70 } else if code == noPrefix {
71 return " -"
72 }
73 return fmt.Sprintf("0x%03X", code)
74}
75
76func main() {
77 n, s, q := new(naive), new(suf1), new(sufQ)
78 decode(n)
79 decode(s)
80 decode(q)
81
82 key := endCode
83 fmt.Printf(" Code Emits Key Value Pre1+Suf1 LM1 /Q %%Q PreQ+SufQ\n")
84 for _, code := range codes {
85 // Code
86 fmt.Printf("%s", codeString(code))
87
88 // Emits
89 if code == endCode {
90 fmt.Printf(" -end-\n")
91 break
92 }
93 fmt.Printf(" %6s", n[code])
94
95 if key != endCode {
96 fmt.Printf(" 0x%03X %7s %s %c %3d %3d %3d %s %s",
97 key, // Key
98 n[key], // Value
99 codeString(int(s.prefixes[key])), // Pre1
100 s.suffixes[key], // Suf1
101 q.lengthM1s[key], // LM1
102 q.lengthM1s[key]/Q, // /Q
103 q.lengthM1s[key]%Q, // %Q
104 codeString(int(q.prefixes[key])), // PreQ
105 string(q.suffixes[key][:]), // SufQ
106 )
107 }
108
109 fmt.Println()
110 key++
111 }
112}
113
114type implementation interface {
115 initialize()
116 emit(buffer []byte, code int, prevCode int, n int) []byte
117}
118
119func decode(impl implementation) {
120 impl.initialize()
121
122 output, prevCode, n := []byte(nil), -1, 0x101
123 for _, code := range codes {
124 if code == clearCode {
125 panic("unimplemented clearCode")
126 } else if code == endCode {
127 if string(output) != expectedOutput {
128 panic("unexpected output")
129 }
130 return
131 } else if code > n {
132 panic("invalid code")
133 }
134
135 // We have a literal or copy code.
136 output = impl.emit(output, code, prevCode, n)
137
138 prevCode, n = code, n+1
139 if n >= 4096 {
140 panic("unimplemented table-is-full")
141 }
142 }
143 panic("missing endCode")
144}
145
146// naive is a simple implementation that, in the worst case, requires O(N*N)
147// memory.
148type naive [4096]string
149
150func (t *naive) initialize() {
151 for i := 0; i < clearCode; i++ {
152 t[i] = string([]byte{byte(i)})
153 }
154}
155
156func (t *naive) emit(output []byte, code int, prevCode int, n int) []byte {
157 if prevCode >= 0 {
158 prevOutput := t[prevCode]
159 if code < n {
160 t[n] = prevOutput + t[code][:1]
161 } else {
162 t[n] = prevOutput + prevOutput[:1]
163 }
164 }
165 return append(output, t[code]...)
166}
167
168// suf1 keeps separate prefix and suffix tables, 1 byte per suffix.
169type suf1 struct {
170 prefixes [4096]uint16
171 suffixes [4096]byte
172}
173
174func (t *suf1) initialize() {
175 for i := 0; i < clearCode; i++ {
176 t.prefixes[i] = noPrefix
177 t.suffixes[i] = byte(i)
178 }
179}
180
181func (t *suf1) emit(output []byte, code int, prevCode int, n int) []byte {
182 // Fill an intermediate buffer from back to front, 1 byte at a time.
183 buffer := [4096]byte{}
184 i := len(buffer)
185
186 c := code
187 if code == n {
188 c = prevCode
189 i--
190 // buffer[4095] will be set later.
191 }
192
193 firstByte := byte(0)
194 for {
195 suffix := t.suffixes[c]
196 i--
197 buffer[i] = suffix
198 c = int(t.prefixes[c])
199 if c == noPrefix {
200 firstByte = suffix
201 break
202 }
203 }
204
205 if code == n {
206 buffer[4095] = firstByte
207 }
208
209 // The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of
210 // 0, is correct conceptually, but unnecessary in practice. Look for "fake
211 // key-value pair" in std/lzw/README.md.
212 if prevCode >= 0 {
213 t.prefixes[n] = uint16(prevCode)
214 t.suffixes[n] = firstByte
215 }
216
217 return append(output, buffer[i:4096]...)
218}
219
220// sufQ keeps separate prefix and suffix tables, up to Q bytes per suffix.
221type sufQ struct {
222 lengthM1s [4096]uint16
223 prefixes [4096]uint16
224 suffixes [4096][Q]byte
225}
226
227func (t *sufQ) initialize() {
228 for i := 0; i < clearCode; i++ {
229 t.lengthM1s[i] = 0
230 t.prefixes[i] = noPrefix
231 t.suffixes[i][0] = byte(i)
232 for j := 1; j < Q; j++ {
233 // Unnecessary for correct output, but makes the printed table prettier.
234 t.suffixes[i][j] = '.'
235 }
236 }
237}
238
239func (t *sufQ) emit(output []byte, code int, prevCode int, n int) []byte {
240 // Fill an intermediate buffer from back to front, Q bytes at a time.
241 buffer := [4096 + Q - 1]byte{}
242 i := len(buffer)
243
244 c := code
245 if code == n {
246 c = prevCode
247 i--
248 // buffer[4095] will be set later.
249 }
250
251 i -= int(t.lengthM1s[c] % Q)
252
253 firstByte := byte(0)
254 for {
255 suffix := t.suffixes[c]
256 i -= Q
257 copy(buffer[i:i+Q], suffix[:])
258 c = int(t.prefixes[c])
259 if c == noPrefix {
260 firstByte = suffix[0]
261 break
262 }
263 }
264
265 if code == n {
266 buffer[4095] = firstByte
267 }
268
269 // The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of
270 // 0, is correct conceptually, but unnecessary in practice. Look for "fake
271 // key-value pair" in std/lzw/README.md.
272 if prevCode >= 0 {
273 lm1 := t.lengthM1s[prevCode] + 1
274 t.lengthM1s[n] = lm1
275 lm1 %= Q
276 if lm1 != 0 {
277 // Copy the prevCode's prefix and suffix, and then extend the suffix.
278 t.prefixes[n] = t.prefixes[prevCode]
279 t.suffixes[n] = t.suffixes[prevCode]
280 t.suffixes[n][lm1] = firstByte
281 } else {
282 // Start a new suffix.
283 t.prefixes[n] = uint16(prevCode)
284 t.suffixes[n][0] = firstByte
285 for j := 1; j < Q; j++ {
286 // Unnecessary for correct output, but makes the printed table prettier.
287 t.suffixes[n][j] = '.'
288 }
289 }
290 }
291
292 return append(output, buffer[i:4096]...)
293}