Open source
  • Bandwidth benchmark
  • TouchWidgets UI lib
  • Diviner big number math
  • Documentation
  • x86 instructions ref
  • GIT quick ref
  • GPG quick ref
  • Avoid Ubuntu
  • Android malware risks
  • iOS malware risks
  • OS/X security tips
  • Who blocks Tor
  • Software engineering
  • BASH aliases
  • I.B. pro/con
  • Nutrition
  • Other apps
  • Blog
  • Contact
    1 at zsmith dot co

    Fast Integer Square Root on Intel x86

    © by
    All rights reserved.

    Back in 2011, I was working on a compiler experiement that I called C@. As part of that project I decided to have some fun and create a fast integer square root. Below is that code. It's a little bit clunky and could be further optimized, but it is fast.

    The inner routine

    find_root:
    _find_root:
    	; EAX = value
    	; EBX = starting of range of roots to try
    	; ECX = ending of range of roots to try
    	cmp ebx, ecx	; if start==end, return start
    	je .L0
    	inc ebx
    	cmp ebx, ecx	; if start==end-1, return start
    	je .L1
    	dec ebx
    	mov eax, ebx	; midpoint = (start+end)/2
    	add eax, ecx
    	shr eax, 1
    	mov esi, eax	; put midpoint in esi
    	mul eax		; m = midpoint^2 
    	or edx, edx	; if m >= to (2^32) do lower range...
    	jnz .L4
    	cmp ebp, eax	; if value >= m do upper range
    	jae .L3		; if value < m do lower range
    .L4: 	mov ecx, esi
    	jmp find_root	; continue into lower range
    .L3: 	mov ebx, esi
    	jmp find_root	; continue into upper range
    .L1: 	dec ebx
    .L0: 	mov eax, ebx
    	ret
    

    The outer routine

    builtin_sqrt:
    _builtin_sqrt:
    	; Builtin square root by Zack T Smith.
    	; Using a divide-and-conquer approach.
    	; EAX = value
    	or eax, eax	; if (!value) return 0;
    	jz .L1
    	cmp eax, 4	; if (value less than 4) return 1;
    	jae .L2
    	mov eax, 1
    	ret
    .L2:	push ebx
    	push ecx
    	push edx
    	push esi
    	push ebp
    	mov ebp, eax
    	bsr ecx, eax	; get # bits used in value
    	inc ecx
    	sar ecx, 1	; nbits /= 2
    	dec ecx  	; start = 1 << (nbits - 1)
    	mov ebx, 1	
    	sal ebx, cl
    	add ecx, 2	; end = 1 << (nbits + 1)
    	mov edx, 1
    	sal edx, cl
    	mov ecx, edx
    	call find_root	; find the root
    	pop ebp
    	pop esi
    	pop edx
    	pop ecx
    	pop ebx
    .L1:	ret
    




    © Zack Smith