# Rebuild indentation-preserving text from an image using Tesseract OCR.
#
# The script runs tesseract in TSV mode, parses the rows into objects, keeps
# the word-level rows, groups them into text lines and prints each line
# prefixed with an indent derived from the left pixel offset of its first word.

$tesseractPath = "tesseract.exe"
$inputImage = "input.jpg"

# Single-quoted on purpose: inside a double-quoted PowerShell string the
# backtick is the escape character (so "`'" would silently drop the backtick)
# and '$' can trigger variable expansion. The apostrophe is doubled to embed it.
$charWhitelist = ' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789[]{}()=+-*/\|&$%^#@!~`'';:,<>.?_'

# One indent level is rendered as this string.
$indentUnit = ' '

$tesseractArgs = @(
    $inputImage, 'stdout',
    '-l', 'eng',
    '--psm', '6',
    '-c', "tessedit_char_whitelist=$charWhitelist",
    'tsv'
)
$tsvData = & $tesseractPath @tesseractArgs
if ($LASTEXITCODE -ne 0) {
    throw "tesseract exited with code $LASTEXITCODE while processing '$inputImage'"
}

# Parse as tab separated values. Blank lines are dropped up front so the first
# remaining line is the header row.
$lines = @($tsvData -split "`r?`n" | Where-Object { $_.Trim() -ne "" })
$headers = $lines[0] -split "`t"

# Select-Object -Skip 1 (rather than $lines[1..($lines.Length - 1)]) avoids the
# descending range 1..0 that would re-read the header when there are no data rows.
$records = $lines | Select-Object -Skip 1 | ForEach-Object {
    $values = $_ -split "`t"
    # Rows whose column count does not match the header are skipped.
    if ($values.Length -eq $headers.Length) {
        $row = [ordered]@{}
        for ($i = 0; $i -lt $headers.Length; $i++) {
            $row[$headers[$i]] = $values[$i]
        }
        [PSCustomObject]$row
    }
}

# Level 5 is "word". Would have been nice to use level 4 "line" but that has no text.
# So, instead, we grab the level 5 "word" rows that contain visible text.
$words = @($records | Where-Object { $_.level -eq '5' -and $_.text -match '\S' })

# Group the words by text line. line_num alone is not unique: tesseract restarts
# it inside every block/paragraph, so the key combines page, block, paragraph
# and line. An ordered dictionary keeps the lines in the order tesseract
# emitted them, independent of how the keys would sort as strings.
$lineGroups = [ordered]@{}
foreach ($word in $words) {
    $key = '{0}/{1}/{2}/{3}' -f $word.page_num, $word.block_num, $word.par_num, $word.line_num
    if (-not $lineGroups.Contains($key)) {
        $lineGroups[$key] = New-Object System.Collections.ArrayList
    }
    [void]$lineGroups[$key].Add($word)
}

# Heuristic: the smallest left offset greater than zero is taken as the width
# of one indent level. If there is no such offset the width stays 0 and every
# line is printed without indentation (this also avoids dividing by zero).
$positiveLefts = @($words | ForEach-Object { [int]$_.left } | Where-Object { $_ -gt 0 })
$tabWidth = 0
if ($positiveLefts.Count -gt 0) {
    $tabWidth = [int]($positiveLefts | Measure-Object -Minimum).Minimum
}

# Now create the indented text, one output string per recognised line.
foreach ($lineWords in $lineGroups.Values) {
    # @() keeps a single-word line an array so [0] indexing is well defined.
    $sortedWords = @($lineWords | Sort-Object { [int]$_.left })
    $firstLeft = [int]$sortedWords[0].left

    $indent = 0
    if ($tabWidth -gt 0) {
        $indent = [int][math]::Round($firstLeft / $tabWidth)
    }

    $textLine = ($sortedWords | ForEach-Object { $_.text }) -join ' '
    Write-Output ($indentUnit * $indent + $textLine)
}