This produces roughly the `innerText` you would get in a browser: plain, compact text that is light enough to pass into LLMs.
# Reduces an HTML string to the plain text found inside its <body>.
#
# Steps:
#   1. Parse `html` with `Floki.parse_document!/1` (the raising variant).
#   2. Keep only the nodes matched by the "body" selector.
#   3. Drop every <script> and <style> element so their contents never
#      reach the extracted text.
#   4. Join the remaining text nodes with a single space, trim the ends,
#      and collapse every whitespace run matched by `\s+` into one space.
#
# Returns a binary.
defp extract_inner_text(html) do
  # Returning nil from a traverse_and_update/2 callback removes that node
  # from the tree; every other node is passed through unchanged.
  strip_non_content = fn
    {"script", _attrs, _children} -> nil
    {"style", _attrs, _children} -> nil
    other -> other
  end

  body_nodes =
    html
    |> Floki.parse_document!()
    |> Floki.find("body")

  raw_text =
    body_nodes
    |> Floki.traverse_and_update(strip_non_content)
    |> Floki.text(sep: " ")

  raw_text
  |> String.trim()
  |> String.replace(~r/\s+/, " ")
end
This produces roughly the `innerText` you would get in a browser: plain, compact text that is light enough to pass into LLMs.