You can make python binary search super fast if you use mmap. here's a version of that I had lying around, it's probably correct.
import os
import mmap
def do_mmap(f):
fd = os.open(f, os.O_RDONLY)
size = os.lseek(fd, 0, 2)
os.lseek(fd, 0, 0)
m = mmap.mmap(fd, size, prot=mmap.PROT_READ)
return m, size, fd
SEEK_SET = 0
SEEK_CUR = 1
class Searcher:
def __init__(self, file):
self.file = file
self.map, self.size, self.fd = do_mmap(file)
def close(self):
self.map.close()
os.close(self.fd)
def find_newline(self):
self.map.readline()
return self.map.tell()
def binary_search(self, q):
pos = 0
start = 0
end = self.size
found = False
#this can get stuck with start = xxx and end = xxx+1, probably from the \r\n
while start < end - 2:
mid = start + (end-start)//2
self.map.seek(mid)
pos = self.find_newline()
if pos > end:
break
line = self.map.readline()
if q < line:
end = mid
elif q > line:
start = mid
while True:
line = self.map.readline()
if not line.startswith(q): break
yield line
if __name__ == "__main__":
import sys
q = sys.argv[1]
s = Searcher("pwned-passwords-sha1-ordered-by-hash-v6.txt")
import time
ss = time.perf_counter()
res = s.binary_search(q.upper().encode())
for x in res:
print(x)
ee = time.perf_counter()
print(ee-ss)