blob: 0d588eec15ecd64bdb8ad272e140b50861e3e3d7 [file] [log] [blame]
Nigel Taob0c72862018-10-21 16:03:47 +11001// Copyright 2018 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// +build ignore
16
17package main
18
19// print-lzw-example.go prints a worked example discussed in std/lzw/README.md,
20// based on three implementations of a simplified core of the LZW algorithm.
21// Simplifications include assuming LSB-first, 8 bit literals, no clear codes,
22// no full tables and that the input is given in terms of a code stream, not a
23// bit stream.
24//
25// The implementations are designed for ease of studying, and for e.g. a
26// minimal diff between the suf1 and sufQ implementations, and are not
27// optimized for performance.
28//
29// Usage: go run print-lzw-example.go
30//
31// You can also play with the code (e.g. modify Q and re-run) online:
32// https://play.golang.org/p/1aLC_XHZzoA
33
34import (
35 "fmt"
Nigel Taod587c9e2020-03-29 22:10:49 +110036 "os"
Nigel Taob0c72862018-10-21 16:03:47 +110037)
38
39const (
40 expectedOutput = "TOBEORNOTTOBEORTOBEORNOTXOTXOTXOOTXOOOTXOOOTOBEY"
41
42 clearCode = 0x100
43 endCode = 0x101
44
45 // Neither clearCode or endCode can ever be a valid prefix code. We
46 // arbitrarily pick clearCode to represent "no prefix".
47 noPrefix = clearCode
48
49 // Q is the maximum possible suffix length, in bytes, which equals 1 plus
50 // the maximum possible "suffix length minus 1". Q == 1 means that sufQ is
51 // equivalent to suf1.
52 //
53 // In std/lzw/README.md, Q is 3 to keep the example short and simple. In
54 // practice, especially on modern CPUs that can do unaligned 32 or 64 bit
55 // loads and stores, Q is 4 or 8.
56 Q = 3
57)
58
59var codes = []int{
60 'T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T',
61 0x102, 0x104, 0x106, 0x10B, 0x105, 0x107, 0x109,
62 'X',
63 0x111, 0x113, 0x114, 0x115, 0x10E,
64 'Y',
65 0x101,
66}
67
68func codeString(code int) string {
69 if code < clearCode {
70 return fmt.Sprintf("%5c", code)
71 } else if code == noPrefix {
72 return " -"
73 }
74 return fmt.Sprintf("0x%03X", code)
75}
76
77func main() {
Nigel Taod587c9e2020-03-29 22:10:49 +110078 if err := main1(); err != nil {
79 os.Stderr.WriteString(err.Error() + "\n")
80 os.Exit(1)
81 }
82}
83
84func main1() error {
Nigel Taob0c72862018-10-21 16:03:47 +110085 n, s, q := new(naive), new(suf1), new(sufQ)
86 decode(n)
87 decode(s)
88 decode(q)
89
90 key := endCode
91 fmt.Printf(" Code Emits Key Value Pre1+Suf1 LM1 /Q %%Q PreQ+SufQ\n")
92 for _, code := range codes {
93 // Code
94 fmt.Printf("%s", codeString(code))
95
96 // Emits
97 if code == endCode {
98 fmt.Printf(" -end-\n")
99 break
100 }
101 fmt.Printf(" %6s", n[code])
102
103 if key != endCode {
104 fmt.Printf(" 0x%03X %7s %s %c %3d %3d %3d %s %s",
105 key, // Key
106 n[key], // Value
107 codeString(int(s.prefixes[key])), // Pre1
108 s.suffixes[key], // Suf1
109 q.lengthM1s[key], // LM1
110 q.lengthM1s[key]/Q, // /Q
111 q.lengthM1s[key]%Q, // %Q
112 codeString(int(q.prefixes[key])), // PreQ
113 string(q.suffixes[key][:]), // SufQ
114 )
115 }
116
117 fmt.Println()
118 key++
119 }
Nigel Taod587c9e2020-03-29 22:10:49 +1100120 return nil
Nigel Taob0c72862018-10-21 16:03:47 +1100121}
122
123type implementation interface {
124 initialize()
125 emit(buffer []byte, code int, prevCode int, n int) []byte
126}
127
128func decode(impl implementation) {
129 impl.initialize()
130
131 output, prevCode, n := []byte(nil), -1, 0x101
132 for _, code := range codes {
133 if code == clearCode {
134 panic("unimplemented clearCode")
135 } else if code == endCode {
136 if string(output) != expectedOutput {
137 panic("unexpected output")
138 }
139 return
140 } else if code > n {
141 panic("invalid code")
142 }
143
144 // We have a literal or copy code.
145 output = impl.emit(output, code, prevCode, n)
146
147 prevCode, n = code, n+1
148 if n >= 4096 {
149 panic("unimplemented table-is-full")
150 }
151 }
152 panic("missing endCode")
153}
154
155// naive is a simple implementation that, in the worst case, requires O(N*N)
156// memory.
157type naive [4096]string
158
159func (t *naive) initialize() {
160 for i := 0; i < clearCode; i++ {
161 t[i] = string([]byte{byte(i)})
162 }
163}
164
165func (t *naive) emit(output []byte, code int, prevCode int, n int) []byte {
166 if prevCode >= 0 {
167 prevOutput := t[prevCode]
168 if code < n {
169 t[n] = prevOutput + t[code][:1]
170 } else {
171 t[n] = prevOutput + prevOutput[:1]
172 }
173 }
174 return append(output, t[code]...)
175}
176
177// suf1 keeps separate prefix and suffix tables, 1 byte per suffix.
178type suf1 struct {
179 prefixes [4096]uint16
180 suffixes [4096]byte
181}
182
183func (t *suf1) initialize() {
184 for i := 0; i < clearCode; i++ {
185 t.prefixes[i] = noPrefix
186 t.suffixes[i] = byte(i)
187 }
188}
189
190func (t *suf1) emit(output []byte, code int, prevCode int, n int) []byte {
191 // Fill an intermediate buffer from back to front, 1 byte at a time.
192 buffer := [4096]byte{}
193 i := len(buffer)
194
195 c := code
196 if code == n {
197 c = prevCode
198 i--
199 // buffer[4095] will be set later.
200 }
201
202 firstByte := byte(0)
203 for {
204 suffix := t.suffixes[c]
205 i--
206 buffer[i] = suffix
207 c = int(t.prefixes[c])
208 if c == noPrefix {
209 firstByte = suffix
210 break
211 }
212 }
213
214 if code == n {
215 buffer[4095] = firstByte
216 }
217
218 // The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of
219 // 0, is correct conceptually, but unnecessary in practice. Look for "fake
220 // key-value pair" in std/lzw/README.md.
221 if prevCode >= 0 {
222 t.prefixes[n] = uint16(prevCode)
223 t.suffixes[n] = firstByte
224 }
225
226 return append(output, buffer[i:4096]...)
227}
228
229// sufQ keeps separate prefix and suffix tables, up to Q bytes per suffix.
230type sufQ struct {
231 lengthM1s [4096]uint16
232 prefixes [4096]uint16
233 suffixes [4096][Q]byte
234}
235
236func (t *sufQ) initialize() {
237 for i := 0; i < clearCode; i++ {
238 t.lengthM1s[i] = 0
239 t.prefixes[i] = noPrefix
240 t.suffixes[i][0] = byte(i)
241 for j := 1; j < Q; j++ {
242 // Unnecessary for correct output, but makes the printed table prettier.
243 t.suffixes[i][j] = '.'
244 }
245 }
246}
247
248func (t *sufQ) emit(output []byte, code int, prevCode int, n int) []byte {
249 // Fill an intermediate buffer from back to front, Q bytes at a time.
250 buffer := [4096 + Q - 1]byte{}
251 i := len(buffer)
252
253 c := code
254 if code == n {
255 c = prevCode
256 i--
257 // buffer[4095] will be set later.
258 }
259
260 i -= int(t.lengthM1s[c] % Q)
261
262 firstByte := byte(0)
263 for {
264 suffix := t.suffixes[c]
265 i -= Q
266 copy(buffer[i:i+Q], suffix[:])
267 c = int(t.prefixes[c])
268 if c == noPrefix {
269 firstByte = suffix[0]
270 break
271 }
272 }
273
274 if code == n {
275 buffer[4095] = firstByte
276 }
277
278 // The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of
279 // 0, is correct conceptually, but unnecessary in practice. Look for "fake
280 // key-value pair" in std/lzw/README.md.
281 if prevCode >= 0 {
282 lm1 := t.lengthM1s[prevCode] + 1
283 t.lengthM1s[n] = lm1
284 lm1 %= Q
285 if lm1 != 0 {
286 // Copy the prevCode's prefix and suffix, and then extend the suffix.
287 t.prefixes[n] = t.prefixes[prevCode]
288 t.suffixes[n] = t.suffixes[prevCode]
289 t.suffixes[n][lm1] = firstByte
290 } else {
291 // Start a new suffix.
292 t.prefixes[n] = uint16(prevCode)
293 t.suffixes[n][0] = firstByte
294 for j := 1; j < Q; j++ {
295 // Unnecessary for correct output, but makes the printed table prettier.
296 t.suffixes[n][j] = '.'
297 }
298 }
299 }
300
301 return append(output, buffer[i:4096]...)
302}