Hacker News new | past | comments | ask | show | jobs | submit login

You can make python binary search super fast if you use mmap. here's a version of that I had lying around, it's probably correct.

  import os
  import mmap
  
  def do_mmap(f):
      fd = os.open(f, os.O_RDONLY)
      size = os.lseek(fd, 0, 2)
      os.lseek(fd, 0, 0)
      m = mmap.mmap(fd, size, prot=mmap.PROT_READ)
      return m, size, fd
  
  SEEK_SET = 0
  SEEK_CUR = 1
  
  class Searcher:
      def __init__(self, file):
          self.file = file
          self.map, self.size, self.fd = do_mmap(file)
  
      def close(self):
          self.map.close()
          os.close(self.fd)
  
      def find_newline(self):
          self.map.readline()
          return self.map.tell()
  
      def binary_search(self, q):
          pos = 0
          start = 0
          end = self.size
          found = False
          #this can get stuck with start = xxx and end = xxx+1, probably from the \r\n
          while start < end - 2:
              mid = start + (end-start)//2
              self.map.seek(mid)
              pos = self.find_newline()
              if pos > end:
                  break
              line = self.map.readline()
              if q < line:
                  end = mid
              elif q > line:
                  start = mid
  
          while True:
              line = self.map.readline()
              if not line.startswith(q): break
              yield line
  
  if __name__ == "__main__":
      import sys
      q = sys.argv[1]
      s = Searcher("pwned-passwords-sha1-ordered-by-hash-v6.txt")
      import time
      ss = time.perf_counter()
      res = s.binary_search(q.upper().encode())
      for x in res:
          print(x)
      ee = time.perf_counter()
      print(ee-ss)



I did try mmap, both with the plaintext binary search, and with the binary file (you can find a note about it in the HTML source :)

I ended up not mentioning it because for some reason, it was ~twice as slow on my mac... I'm now curious to try it on a decent Linux machine.




Guidelines | FAQ | Lists | API | Security | Legal | Apply to YC | Contact

Search: