source: trunk/minix/lib/zlib-1.2.3/contrib/masmx86/gvmat32.asm@ 9

Last change on this file since 9 was 9, checked in by Mattia Monga, 13 years ago

Minix 3.1.2a

File size: 26.5 KB
Line 
1; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86
2; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant.
3; File written by Gilles Vollant, by modifying the longest_match
4; from Jean-loup Gailly in deflate.c
5;
6; http://www.zlib.net
7; http://www.winimage.com/zLibDll
8; http://www.muppetlabs.com/~breadbox/software/assembly.html
9;
10; For Visual C++ 4.x and higher and ML 6.x and higher
11; ml.exe is in directory \MASM611C of Win95 DDK
12; ml.exe is also distributed in http://www.masm32.com/masmdl.htm
13; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
14;
15; this file contains two implementations of longest_match
16;
17; longest_match_7fff : written 1996 by Gilles Vollant optimized for
18; first Pentium. Assume s->w_mask == 0x7fff
19; longest_match_686 : written by Brian Raiter (1998), optimized for Pentium Pro
20;
21; to use an assembly version of longest_match, you need to define ASMV in your project
22; There are two ways of using gvmat32.asm
23;
24; A) Suggested method
25; if you want include both longest_match_7fff and longest_match_686
26; compile the asm file running
27; ml /coff /Zi /Flgvmat32.lst /c gvmat32.asm
28; and include gvmat32c.c in your project
29; if you have an old cpu (386,486 or first Pentium) and s->w_mask==0x7fff,
30; longest_match_7fff will be used
31; if you have a more modern CPU (Pentium Pro, II and higher)
32; longest_match_686 will be used
33; on old cpu with s->w_mask!=0x7fff, longest_match_686 will be used,
34; but this is not a situation you'll find often
35;
36; B) Alternative
37; if you are not interested in old cpu performance and want the smallest
38; possible binaries
39;
40; compile the asm file running
41; ml /coff /Zi /c /Flgvmat32.lst /DNOOLDPENTIUMCODE gvmat32.asm
42; and do not include gvmat32c.c in your project (or also define
43; NOOLDPENTIUMCODE)
44;
45; note : as far as I know, longest_match_686 is much faster than longest_match_7fff
46; on Pentium Pro/II/III, faster (but less so) on P4, but it seems
47; longest_match_7fff can be faster (very, very little) on AMD Athlon64/K8
48;
49; see below : zlib1222add must be adjusted if you use a zlib version < 1.2.2.2
50
51;uInt longest_match_7fff(s, cur_match)
52; deflate_state *s;
53; IPos cur_match; /* current match */
54
55 NbStack equ 76 ; offset from esp (after the prologue of longest_match_7fff) to the cur_match argument
56 cur_match equ dword ptr[esp+NbStack-0] ; second argument
57 str_s equ dword ptr[esp+NbStack-4] ; first argument: pointer to the deflate_state
58; 5 dwords below the arguments (ret,ebp,edi,esi,ebx), in the order they are pushed
59 adrret equ dword ptr[esp+NbStack-8]
60 pushebp equ dword ptr[esp+NbStack-12]
61 pushedi equ dword ptr[esp+NbStack-16]
62 pushesi equ dword ptr[esp+NbStack-20]
63 pushebx equ dword ptr[esp+NbStack-24]
64; local variables of longest_match_7fff, addressed relative to esp
65 chain_length equ dword ptr [esp+NbStack-28]
66 limit equ dword ptr [esp+NbStack-32]
67 best_len equ dword ptr [esp+NbStack-36]
68 window equ dword ptr [esp+NbStack-40]
69 prev equ dword ptr [esp+NbStack-44]
70 scan_start equ word ptr [esp+NbStack-48] ; 16-bit value: first two bytes at scan
71 wmask equ dword ptr [esp+NbStack-52]
72 match_start_ptr equ dword ptr [esp+NbStack-56]
73 nice_match equ dword ptr [esp+NbStack-60]
74 scan equ dword ptr [esp+NbStack-64]
75
76 windowlen equ dword ptr [esp+NbStack-68] ; window + best_len - 1
77 match_start equ dword ptr [esp+NbStack-72]
78 strend equ dword ptr [esp+NbStack-76] ; scan + MAX_MATCH - 1
79 NbStackAdd equ (NbStack-24) ; size of the local area reserved with "sub esp,NbStackAdd" (52 bytes)
80
81 .386p
82
83 name gvmatch
84 .MODEL FLAT
85
86
87
88; all the +zlib1222add offsets are due to the addition of fields
89; in zlib in the deflate_state structure since the asm code was first written
90; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
91; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
92; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
93
94 zlib1222add equ 8
95
96; Note : these values are valid when the structure is packed on 8-byte boundaries.
97 dep_chain_length equ 74h+zlib1222add ; s->max_chain_length
98 dep_window equ 30h+zlib1222add ; s->window
99 dep_strstart equ 64h+zlib1222add ; s->strstart
100 dep_prev_length equ 70h+zlib1222add ; s->prev_length
101 dep_nice_match equ 88h+zlib1222add ; s->nice_match
102 dep_w_size equ 24h+zlib1222add ; s->w_size
103 dep_prev equ 38h+zlib1222add ; s->prev
104 dep_w_mask equ 2ch+zlib1222add ; s->w_mask
105 dep_good_match equ 84h+zlib1222add ; s->good_match
106 dep_match_start equ 68h+zlib1222add ; s->match_start
107 dep_lookahead equ 6ch+zlib1222add ; s->lookahead
108
109
110_TEXT segment
111; Exported symbols: NOUNDERLINE drops the leading underscore; NOOLDPENTIUMCODE exports only longest_match and match_init.
112IFDEF NOUNDERLINE
113 IFDEF NOOLDPENTIUMCODE
114 public longest_match
115 public match_init
116 ELSE
117 public longest_match_7fff
118 public cpudetect32
119 public longest_match_686
120 ENDIF
121ELSE
122 IFDEF NOOLDPENTIUMCODE
123 public _longest_match
124 public _match_init
125 ELSE
126 public _longest_match_7fff
127 public _cpudetect32
128 public _longest_match_686
129 ENDIF
130ENDIF
131; match-length constants shared by both implementations
132 MAX_MATCH equ 258
133 MIN_MATCH equ 3
134 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) ; subtracted from s->w_size to obtain MAX_DIST(s)
135
136
137
138IFNDEF NOOLDPENTIUMCODE
139IFDEF NOUNDERLINE
140longest_match_7fff proc near
141ELSE
142_longest_match_7fff proc near
143ENDIF
144; uInt longest_match_7fff(deflate_state *s, IPos cur_match); arguments on the stack, result in eax
145 mov edx,[esp+4] ; edx = s, read before the pushes move esp
146; the window mask is hard-coded as 7fffh below, so this routine requires s->w_mask == 0x7fff
147; ebp, edi, esi and ebx are saved here and popped in reverse order before ret
148
149 push ebp
150 push edi
151 push esi
152 push ebx
153
154 sub esp,NbStackAdd ; reserve the local variable area
155
156; initialize or check the variables used in match.asm.
157 mov ebp,edx ; ebp = s during the setup
158
159; chain_length = s->max_chain_length
160; if (prev_length>=good_match) chain_length >>= 2
161 mov edx,[ebp+dep_chain_length]
162 mov ebx,[ebp+dep_prev_length] ; ebx = s->prev_length
163 cmp [ebp+dep_good_match],ebx
164 ja noshr ; good_match > prev_length (unsigned): keep the full chain length
165 shr edx,2
166noshr:
167; we increment chain_length because in the asm, the --chain_length is in the beginning of the loop
168 inc edx
169 mov edi,[ebp+dep_nice_match] ; edi = s->nice_match
170 mov chain_length,edx
171 mov eax,[ebp+dep_lookahead] ; eax = s->lookahead
172 cmp eax,edi
173; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
174 jae nolookaheadnicematch ; lookahead >= nice_match: keep nice_match
175 mov edi,eax
176nolookaheadnicematch:
177; best_len = s->prev_length
178 mov best_len,ebx
179
180; window = s->window
181 mov esi,[ebp+dep_window]
182 mov ecx,[ebp+dep_strstart] ; ecx = s->strstart
183 mov window,esi
184
185 mov nice_match,edi
186; scan = window + strstart
187 add esi,ecx
188 mov scan,esi
189; dx = *scan (esi holds scan here)
190 mov dx,word ptr [esi]
191; bx = *(scan+best_len-1); ebx still holds best_len when it is used as the index
192 mov bx,word ptr [esi+ebx-1]
193 add esi,MAX_MATCH-1
194; scan_start = *scan
195 mov scan_start,dx
196; strend = scan + MAX_MATCH-1
197 mov strend,esi
198; bx = scan_end = *(scan+best_len-1)
199
200; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
201; s->strstart - (IPos)MAX_DIST(s) : NIL;
202
203 mov esi,[ebp+dep_w_size]
204 sub esi,MIN_LOOKAHEAD
205; here esi = MAX_DIST(s)
206 sub ecx,esi
207 ja nodist ; strstart > MAX_DIST (unsigned): ecx = strstart - MAX_DIST
208 xor ecx,ecx ; otherwise limit = 0
209nodist:
210 mov limit,ecx
211
212; prev = s->prev
213 mov edx,[ebp+dep_prev]
214 mov prev,edx
215
216; match_start = s->match_start; bp is then overwritten with scan_start, so s is reloaded from str_s at exitloop
217 mov edx,dword ptr [ebp+dep_match_start]
218 mov bp,scan_start
219 mov eax,cur_match ; eax = cur_match argument
220 mov match_start,edx
221
222 mov edx,window
223 mov edi,edx
224 add edi,best_len
225 mov esi,prev
226 dec edi
227; windowlen = window + best_len -1
228 mov windowlen,edi
229
230 jmp beginloop2
231 align 4
232
233; here, in the loop
234; eax = ax = cur_match
235; ecx = limit
236; bx = scan_end
237; bp = scan_start
238; edi = windowlen (window + best_len -1)
239; esi = prev
240; edx = window
241
242;// here; chain_length <=16
243normalbeg0add16:
244 add chain_length,16 ; give back the 16 subtracted by the unrolled-loop bookkeeping
245 jz exitloop ; nothing left in the chain budget
246normalbeg0:
247 cmp word ptr[edi+eax],bx ; *(match+best_len-1) == scan_end ?
248 je normalbeg2noroll
249rcontlabnoroll:
250; cur_match = prev[cur_match & wmask] (wmask is the constant 7fffh here)
251 and eax,7fffh
252 mov ax,word ptr[esi+eax*2]
253; if cur_match <= limit, go to exitloop
254 cmp ecx,eax
255 jnb exitloop
256; if --chain_length != 0, go to normalbeg0, else exit
257 dec chain_length
258 jnz normalbeg0
259 jmp exitloop
260
261normalbeg2noroll:
262; if (scan_start==*(cur_match+window)) goto normalbeg2
263 cmp bp,word ptr[edx+eax]
264 jne rcontlabnoroll
265 jmp normalbeg2
266
267contloop3:
268 mov edi,windowlen ; edi was used as a scratch pointer; restore window + best_len - 1
269
270; cur_match = prev[cur_match & wmask]
271 and eax,7fffh
272 mov ax,word ptr[esi+eax*2]
273; if cur_match <= limit, go to exitloop
274 cmp ecx,eax
275jnbexitloopshort1:
276 jnb exitloop
277; no explicit decrement here: beginloop2 below subtracts 16+1 from chain_length
278
279
280; begin the main loop
281beginloop2:
282 sub chain_length,16+1
283; if chain_length <=16, don't use the unrolled loop
284 jna normalbeg0add16
285
286do16:
287 cmp word ptr[edi+eax],bx ; *(match+best_len-1) == scan_end ?
288 je normalbeg2dc0
289; maccn lab: one unrolled step - follow prev[], exit at the limit, jump to lab when scan_end matches
290maccn MACRO lab
291 and eax,7fffh
292 mov ax,word ptr[esi+eax*2]
293 cmp ecx,eax
294 jnb exitloop
295 cmp word ptr[edi+eax],bx
296 je lab
297 ENDM
298
299rcontloop0:
300 maccn normalbeg2dc1
301
302rcontloop1:
303 maccn normalbeg2dc2
304
305rcontloop2:
306 maccn normalbeg2dc3
307
308rcontloop3:
309 maccn normalbeg2dc4
310
311rcontloop4:
312 maccn normalbeg2dc5
313
314rcontloop5:
315 maccn normalbeg2dc6
316
317rcontloop6:
318 maccn normalbeg2dc7
319
320rcontloop7:
321 maccn normalbeg2dc8
322
323rcontloop8:
324 maccn normalbeg2dc9
325
326rcontloop9:
327 maccn normalbeg2dc10
328
329rcontloop10:
330 maccn short normalbeg2dc11
331
332rcontloop11:
333 maccn short normalbeg2dc12
334
335rcontloop12:
336 maccn short normalbeg2dc13
337
338rcontloop13:
339 maccn short normalbeg2dc14
340
341rcontloop14:
342 maccn short normalbeg2dc15
343
344rcontloop15:
345 and eax,7fffh
346 mov ax,word ptr[esi+eax*2]
347 cmp ecx,eax
348 jnb exitloop
349; 16 chain steps done: charge them and run another unrolled pass while the budget stays positive
350 sub chain_length,16
351 ja do16
352 jmp normalbeg0add16
353
354;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
355; normbeg rcontlab,valsub: landing pad for a scan_end hit at unrolled step number valsub
356normbeg MACRO rcontlab,valsub
357; if we are here, we know that *(match+best_len-1) == scan_end
358 cmp bp,word ptr[edx+eax]
359; if (*match != scan_start) goto rcontlab
360 jne rcontlab
361; calculate the good chain_length, and we'll compare scan and match string
362 add chain_length,16-valsub
363 jmp iseq
364 ENDM
365
366
367normalbeg2dc11:
368 normbeg rcontloop11,11
369
370normalbeg2dc12:
371 normbeg short rcontloop12,12
372
373normalbeg2dc13:
374 normbeg short rcontloop13,13
375
376normalbeg2dc14:
377 normbeg short rcontloop14,14
378
379normalbeg2dc15:
380 normbeg short rcontloop15,15
381
382normalbeg2dc10:
383 normbeg rcontloop10,10
384
385normalbeg2dc9:
386 normbeg rcontloop9,9
387
388normalbeg2dc8:
389 normbeg rcontloop8,8
390
391normalbeg2dc7:
392 normbeg rcontloop7,7
393
394normalbeg2dc6:
395 normbeg rcontloop6,6
396
397normalbeg2dc5:
398 normbeg rcontloop5,5
399
400normalbeg2dc4:
401 normbeg rcontloop4,4
402
403normalbeg2dc3:
404 normbeg rcontloop3,3
405
406normalbeg2dc2:
407 normbeg rcontloop2,2
408
409normalbeg2dc1:
410 normbeg rcontloop1,1
411
412normalbeg2dc0:
413 normbeg rcontloop0,0
414
415
416; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end
417
418normalbeg2:
419 mov edi,window
420
421 cmp bp,word ptr[edi+eax]
422 jne contloop3 ; if *(ushf*)match != scan_start, continue
423
424iseq:
425; if we are here, we know that *(match+best_len-1) == scan_end
426; and (*match == scan_start)
427
428 mov edi,edx
429 mov esi,scan ; esi = scan
430 add edi,eax ; edi = window + cur_match = match
431
432 mov edx,[esi+3] ; compare manually dword at match+3
433 xor edx,[edi+3] ; and scan +3
434
435 jz begincompare ; if equal, go to long compare
436
437; we will determine the mismatching byte and calculate len (in esi)
438 or dl,dl
439 je eq1rr
440 mov esi,3 ; byte at offset 3 differs
441 jmp trfinval
442eq1rr:
443 or dx,dx
444 je eq1
445
446 mov esi,4 ; byte at offset 4 differs
447 jmp trfinval
448eq1:
449 and edx,0ffffffh
450 jz eq11
451 mov esi,5 ; byte at offset 5 differs
452 jmp trfinval
453eq11:
454 mov esi,6 ; only the byte at offset 6 differs
455 jmp trfinval
456
457begincompare:
458 ; here we know that scan and match begin the same
459 add edi,6
460 add esi,6
461 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes (ecx = limit is reloaded afterwards)
462 repe cmpsd ; loop until mismatch
463
464 je trfin ; go to trfin if every dword matched
465; we determine the mismatching byte
466 sub esi,4 ; esi = address in scan of the dword that differs
467 mov edx,[edi-4]
468 xor edx,[esi]
469
470 or dl,dl
471 jnz trfin
472 inc esi
473
474 or dx,dx
475 jnz trfin
476 inc esi
477
478 and edx,0ffffffh
479 jnz trfin
480 inc esi
481
482trfin:
483 sub esi,scan ; esi = len
484trfinval:
485; here we have finished the compare, and esi contains the length of the equal string
486 cmp esi,best_len ; if len > best_len, go newbestlen
487 ja short newbestlen
488; now we restore edx, ecx and esi, for the big loop
489 mov esi,prev
490 mov ecx,limit
491 mov edx,window
492 jmp contloop3
493
494newbestlen:
495 mov best_len,esi ; len become best_len
496
497 mov match_start,eax ; save new position as match_start
498 cmp esi,nice_match ; if best_len >= nice_match, exit
499 jae exitloop
500 mov ecx,scan
501 mov edx,window ; restore edx=window
502 add ecx,esi
503 add esi,edx
504
505 dec esi
506 mov windowlen,esi ; windowlen = window + best_len-1
507 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end
508
509; now we restore ecx and esi, for the big loop :
510 mov esi,prev
511 mov ecx,limit
512 jmp contloop3
513
514exitloop:
515; exit : s->match_start=match_start
516 mov ebx,match_start
517 mov ebp,str_s ; reload s (bp was overwritten with scan_start)
518 mov ecx,best_len
519 mov dword ptr [ebp+dep_match_start],ebx
520 mov eax,dword ptr [ebp+dep_lookahead]
521 cmp ecx,eax
522 ja minexlo ; best_len > lookahead: return lookahead
523 mov eax,ecx
524minexlo:
525; return min(best_len,s->lookahead)
526
527; restore stack and register ebx,esi,edi,ebp
528 add esp,NbStackAdd
529
530 pop ebx
531 pop esi
532 pop edi
533 pop ebp
534 ret
535InfoAuthor:
536; please don't remove this string !
537; You are free to use gvmat32 in any free or commercial app if you don't remove the string from the binary!
538 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah
539
540
541
542IFDEF NOUNDERLINE
543longest_match_7fff endp
544ELSE
545_longest_match_7fff endp
546ENDIF
547
548
549IFDEF NOUNDERLINE
550cpudetect32 proc near
551ELSE
552_cpudetect32 proc near
553ENDIF
554; returns in eax: 0300h if the AC flag cannot be toggled, 0400h if the ID flag cannot be toggled, otherwise eax as left by CPUID with eax=1
555 push ebx ; preserved for the caller (CPUID overwrites ebx, per the Intel manuals)
556
557 pushfd ; push original EFLAGS
558 pop eax ; get original EFLAGS
559 mov ecx, eax ; save original EFLAGS
560 xor eax, 40000h ; flip AC bit in EFLAGS
561 push eax ; save new EFLAGS value on stack
562 popfd ; replace current EFLAGS value
563 pushfd ; get new EFLAGS
564 pop eax ; store new EFLAGS in EAX
565 xor eax, ecx ; can't toggle AC bit, processor=80386
566 jz end_cpu_is_386 ; jump if 80386 processor
567 push ecx
568 popfd ; restore AC bit in EFLAGS first
569
570 pushfd ; this copy stays on the stack until the popfd below
571 pushfd
572 pop ecx ; ecx = original EFLAGS
573
574 mov eax, ecx ; get original EFLAGS
575 xor eax, 200000h ; flip ID bit in EFLAGS
576 push eax ; save new EFLAGS value on stack
577 popfd ; replace current EFLAGS value
578 pushfd ; get new EFLAGS
579 pop eax ; store new EFLAGS in EAX
580 popfd ; restore original EFLAGS
581 xor eax, ecx ; can't toggle ID bit,
582 je is_old_486 ; processor=old
583
584 mov eax,1
585 db 0fh,0a2h ;CPUID (emitted as raw opcode bytes)
586
587exitcpudetect:
588 pop ebx
589 ret
590
591end_cpu_is_386:
592 mov eax,0300h
593 jmp exitcpudetect
594
595is_old_486:
596 mov eax,0400h
597 jmp exitcpudetect
598
599IFDEF NOUNDERLINE
600cpudetect32 endp
601ELSE
602_cpudetect32 endp
603ENDIF
604ENDIF
605
606MAX_MATCH equ 258 ; same value as the definition earlier in the file
607MIN_MATCH equ 3
608MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
609MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF8h) ; MAX_MATCH rounded up to a multiple of 8 = 264 (0108h), the literal used by the compare loop
610
611
612;;; stack frame offsets (used by the Pentium Pro routine below; window and scan replace the equates of the same name defined for longest_match_7fff)
613
614chainlenwmask equ esp + 0 ; high word: current chain len
615 ; low word: s->wmask
616window equ esp + 4 ; local copy of s->window
617windowbestlen equ esp + 8 ; s->window + bestlen
618scanstart equ esp + 16 ; first two bytes of string
619scanend equ esp + 12 ; last two bytes of string
620scanalign equ esp + 20 ; dword-misalignment of string
621nicematch equ esp + 24 ; a good enough match size
622bestlen equ esp + 28 ; size of best match so far
623scan equ esp + 32 ; ptr to string wanting match
624
625LocalVarsSize equ 36
626; saved ebx at esp + 36
627; saved esi at esp + 40
628; saved edi at esp + 44
629; saved ebp at esp + 48
630; return address at esp + 52
631deflatestate equ esp + 56 ; the function arguments
632curmatch equ esp + 60
633
634;;; Offsets for fields in the deflate_state structure. These numbers
635;;; are calculated from the definition of deflate_state, with the
636;;; assumption that the compiler will dword-align the fields. (Thus,
637;;; changing the definition of deflate_state could easily cause this
638;;; program to crash horribly, without so much as a warning at
639;;; compile time. Sigh.)
640
641dsWSize equ 36+zlib1222add
642dsWMask equ 44+zlib1222add
643dsWindow equ 48+zlib1222add
644dsPrev equ 56+zlib1222add
645dsMatchLen equ 88+zlib1222add
646dsPrevMatch equ 92+zlib1222add
647dsStrStart equ 100+zlib1222add
648dsMatchStart equ 104+zlib1222add
649dsLookahead equ 108+zlib1222add
650dsPrevLen equ 112+zlib1222add
651dsMaxChainLen equ 116+zlib1222add
652dsGoodMatch equ 132+zlib1222add
653dsNiceMatch equ 136+zlib1222add
654
655
656;;; match.asm -- Pentium-Pro-optimized version of longest_match()
657;;; Written for zlib 1.1.2
658;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
659;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
660;;;
661;;; This is free software; you can redistribute it and/or modify it
662;;; under the terms of the GNU General Public License.
663
664;GLOBAL _longest_match, _match_init
665
666
667;SECTION .text
668
669;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
670
671;_longest_match:
672IFDEF NOOLDPENTIUMCODE
673 IFDEF NOUNDERLINE
674 longest_match proc near
675 ELSE
676 _longest_match proc near
677 ENDIF
678ELSE
679 IFDEF NOUNDERLINE
680 longest_match_686 proc near
681 ELSE
682 _longest_match_686 proc near
683 ENDIF
684ENDIF
685
686;;; Save registers that the compiler may be using, and adjust esp to
687;;; make room for our stack frame.
688
689 push ebp
690 push edi
691 push esi
692 push ebx
693 sub esp, LocalVarsSize
694
695;;; Retrieve the function arguments. ecx will hold cur_match
696;;; throughout the entire function. edx will hold the pointer to the
697;;; deflate_state structure during the function's setup (before
698;;; entering the main loop).
699
700 mov edx, [deflatestate]
701 mov ecx, [curmatch]
702
703;;; uInt wmask = s->w_mask;
704;;; unsigned chain_length = s->max_chain_length;
705;;; if (s->prev_length >= s->good_match) {
706;;; chain_length >>= 2;
707;;; }
708
709 mov eax, [edx + dsPrevLen]
710 mov ebx, [edx + dsGoodMatch]
711 cmp eax, ebx
712 mov eax, [edx + dsWMask] ; the two mov instructions leave the flags from cmp intact
713 mov ebx, [edx + dsMaxChainLen]
714 jl LastMatchGood ; prev_length < good_match (signed compare): skip the shift
715 shr ebx, 2
716LastMatchGood:
717
718;;; chainlen is decremented once beforehand so that the function can
719;;; use the sign flag instead of the zero flag for the exit test.
720;;; It is then shifted into the high word, to make room for the wmask
721;;; value, which it will always accompany.
722
723 dec ebx
724 shl ebx, 16
725 or ebx, eax
726 mov [chainlenwmask], ebx
727
728;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
729
730 mov eax, [edx + dsNiceMatch]
731 mov ebx, [edx + dsLookahead]
732 cmp ebx, eax
733 jl LookaheadLess ; lookahead < nice_match (signed compare): keep ebx = lookahead
734 mov ebx, eax
735LookaheadLess: mov [nicematch], ebx
736
737;;; register Bytef *scan = s->window + s->strstart;
738
739 mov esi, [edx + dsWindow]
740 mov [window], esi
741 mov ebp, [edx + dsStrStart]
742 lea edi, [esi + ebp]
743 mov [scan], edi
744
745;;; Determine how many bytes the scan ptr is off from being
746;;; dword-aligned.
747
748 mov eax, edi
749 neg eax
750 and eax, 3 ; eax = (-scan) & 3
751 mov [scanalign], eax
752
753;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
754;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
755
756 mov eax, [edx + dsWSize]
757 sub eax, MIN_LOOKAHEAD
758 sub ebp, eax
759 jg LimitPositive ; signed test of strstart - MAX_DIST
760 xor ebp, ebp
761LimitPositive:
762
763;;; int best_len = s->prev_length;
764
765 mov eax, [edx + dsPrevLen]
766 mov [bestlen], eax
767
768;;; Store the sum of s->window + best_len in esi, and in windowbestlen locally.
769
770 add esi, eax
771 mov [windowbestlen], esi
772
773;;; register ush scan_start = *(ushf*)scan;
774;;; register ush scan_end = *(ushf*)(scan+best_len-1);
775;;; Posf *prev = s->prev;
776
777 movzx ebx, word ptr [edi]
778 mov [scanstart], ebx
779 movzx ebx, word ptr [edi + eax - 1]
780 mov [scanend], ebx
781 mov edi, [edx + dsPrev]
782
783;;; Jump into the main loop.
784
785 mov edx, [chainlenwmask]
786 jmp short LoopEntry
787
788align 4
789
790;;; do {
791;;; match = s->window + cur_match;
792;;; if (*(ushf*)(match+best_len-1) != scan_end ||
793;;; *(ushf*)match != scan_start) continue;
794;;; [...]
795;;; } while ((cur_match = prev[cur_match & wmask]) > limit
796;;; && --chain_length != 0);
797;;;
798;;; Here is the inner loop of the function. The function will spend the
799;;; majority of its time in this loop, and majority of that time will
800;;; be spent in the first ten instructions.
801;;;
802;;; Within this loop:
803;;; ebx = scanend
804;;; ecx = curmatch
805;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
806;;; esi = windowbestlen - i.e., (window + bestlen)
807;;; edi = prev
808;;; ebp = limit
809
810LookupLoop:
811 and ecx, edx ; mask cur_match with edx (wmask in the low word, chain length in the high word)
812 movzx ecx, word ptr [edi + ecx*2] ; cur_match = prev[...]
813 cmp ecx, ebp
814 jbe LeaveNow ; cur_match <= limit
815 sub edx, 00010000h ; decrement the chain length kept in the high word
816 js LeaveNow ; chain length went negative
817LoopEntry: movzx eax, word ptr [esi + ecx - 1]
818 cmp eax, ebx
819 jnz LookupLoop
820 mov eax, [window]
821 movzx eax, word ptr [eax + ecx]
822 cmp eax, [scanstart]
823 jnz LookupLoop
824
825;;; Store the current value of chainlen.
826
827 mov [chainlenwmask], edx
828
829;;; Point edi to the string under scrutiny, and esi to the string we
830;;; are hoping to match it up with. In actuality, esi and edi are
831;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and edx is
832;;; initialized to -MAX_MATCH_8, so [edi + edx] starts at scan + scanalign.
833
834 mov esi, [window]
835 mov edi, [scan]
836 add esi, ecx
837 mov eax, [scanalign]
838 mov edx, 0fffffef8h; -(MAX_MATCH_8) = -264
839 lea edi, [edi + eax + 0108h] ; scan + scanalign + MAX_MATCH_8 (0108h = 264)
840 lea esi, [esi + eax + 0108h] ; match + scanalign + MAX_MATCH_8
841
842;;; Test the strings for equality, 8 bytes at a time. At the end,
843;;; adjust edx so that it is offset to the exact byte that mismatched.
844;;;
845;;; We already know at this point that the first three bytes of the
846;;; strings match each other, and they can be safely passed over before
847;;; starting the compare loop. So what this code does is skip over 0-3
848;;; bytes, as much as necessary in order to dword-align the edi
849;;; pointer. (esi will still be misaligned three times out of four.)
850;;;
851;;; It should be confessed that this loop usually does not represent
852;;; much of the total running time. Replacing it with a more
853;;; straightforward "rep cmpsb" would not drastically degrade
854;;; performance.
855
856LoopCmps:
857 mov eax, [esi + edx]
858 xor eax, [edi + edx]
859 jnz LeaveLoopCmps
860 mov eax, [esi + edx + 4]
861 xor eax, [edi + edx + 4]
862 jnz LeaveLoopCmps4
863 add edx, 8
864 jnz LoopCmps
865 jmp short LenMaximum
866LeaveLoopCmps4: add edx, 4
867LeaveLoopCmps: test eax, 0000FFFFh
868 jnz LenLower ; the difference is in the low two bytes
869 add edx, 2
870 shr eax, 16
871LenLower: sub al, 1 ; carry is set only when the low byte of the difference is zero
872 adc edx, 0 ; ...in which case the mismatch is one byte further on
873
874;;; Calculate the length of the match. If it is at least MAX_MATCH,
875;;; then automatically accept it as the best possible match and leave.
876
877 lea eax, [edi + edx]
878 mov edi, [scan]
879 sub eax, edi
880 cmp eax, MAX_MATCH
881 jge LenMaximum
882
883;;; If the length of the match is not longer than the best match we
884;;; have so far, then forget it and return to the lookup loop.
885
886 mov edx, [deflatestate]
887 mov ebx, [bestlen]
888 cmp eax, ebx
889 jg LongerMatch
890 mov esi, [windowbestlen]
891 mov edi, [edx + dsPrev]
892 mov ebx, [scanend]
893 mov edx, [chainlenwmask]
894 jmp LookupLoop
895
896;;; s->match_start = cur_match;
897;;; best_len = len;
898;;; if (len >= nice_match) break;
899;;; scan_end = *(ushf*)(scan+best_len-1);
900
901LongerMatch: mov ebx, [nicematch]
902 mov [bestlen], eax
903 mov [edx + dsMatchStart], ecx
904 cmp eax, ebx
905 jge LeaveNow
906 mov esi, [window]
907 add esi, eax
908 mov [windowbestlen], esi
909 movzx ebx, word ptr [edi + eax - 1] ; edi = scan here
910 mov edi, [edx + dsPrev]
911 mov [scanend], ebx
912 mov edx, [chainlenwmask]
913 jmp LookupLoop
914
915;;; Accept the current string, with the maximum possible length.
916
917LenMaximum: mov edx, [deflatestate]
918 mov dword ptr [bestlen], MAX_MATCH
919 mov [edx + dsMatchStart], ecx
920
921;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
922;;; return s->lookahead;
923
924LeaveNow:
925 mov edx, [deflatestate]
926 mov ebx, [bestlen]
927 mov eax, [edx + dsLookahead]
928 cmp ebx, eax
929 jg LookaheadRet ; best_len > lookahead (signed compare): return lookahead
930 mov eax, ebx
931LookaheadRet:
932
933;;; Restore the stack and return from whence we came.
934
935 add esp, LocalVarsSize
936 pop ebx
937 pop esi
938 pop edi
939 pop ebp
940
941 ret
942; please don't remove this string !
943; You can freely use gvmat32 in any free or commercial app if you don't remove the string from the binary!
944 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
945
946; match_init is an empty routine (a single ret), assembled only when NOOLDPENTIUMCODE is defined
947IFDEF NOOLDPENTIUMCODE
948 IFDEF NOUNDERLINE
949 longest_match endp
950 ELSE
951 _longest_match endp
952 ENDIF
953
954 IFDEF NOUNDERLINE
955 match_init proc near
956 ret
957 match_init endp
958 ELSE
959 _match_init proc near
960 ret
961 _match_init endp
962 ENDIF
963ELSE
964 IFDEF NOUNDERLINE
965 longest_match_686 endp
966 ELSE
967 _longest_match_686 endp
968 ENDIF
969ENDIF
970
971_TEXT ends
972end
Note: See TracBrowser for help on using the repository browser.