Functions/Helper/Get-IBHFileEncoding.ps1
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
<#
    .SYNOPSIS
        Guess the encoding of the specified file.

    .DESCRIPTION
        The detection runs in three steps:

        1. The first five lines of the file are checked for non-printable
           characters. If any are found, the file is treated as binary and a
           terminating error is thrown.
        2. The first four bytes of the file are compared against the known
           byte order marks (BOM). UTF-8, UTF-16 (LE/BE) and UTF-32 (LE/BE)
           return the matching encoding. For UTF-7, UTF-1, UTF-EBCDIC, SCSU,
           BOCU-1 and GB-18030 a terminating error is thrown, because these
           encodings are not supported.
        3. If no BOM is present, the raw bytes of the file are inspected. Any
           byte greater than 127 results in UTF-8, otherwise ASCII is
           returned.

    .OUTPUTS
        System.Text.Encoding. Encoding of the file.

    .EXAMPLE
        PS C:\> Get-IBHFileEncoding -Path 'C:\Temp\demo.txt'
        Guess the encoding of the demo.txt file.

    .LINK
        https://github.com/claudiospizzi/PSInvokeBuildHelper
#>
function Get-IBHFileEncoding
{
    [CmdletBinding()]
    [OutputType([System.Text.Encoding])]
    param
    (
        # Path to the file.
        [Parameter(Mandatory = $true)]
        [System.String]
        $Path
    )

    # Read the first 4 bytes of the file. The byte stream parameter differs
    # between Windows PowerShell and PowerShell 6+. The array sub-expression
    # ensures an empty file yields an empty array instead of $null.
    if ($PSVersionTable.PSVersion.Major -lt 6)
    {
        [System.Byte[]] $bytes = @(Get-Content -Path $Path -TotalCount 4 -Encoding 'Byte')
    }
    else
    {
        [System.Byte[]] $bytes = @(Get-Content -Path $Path -TotalCount 4 -AsByteStream)
    }

    # Binary
    # Read the first 5 lines of the file and check them for non printable
    # characters. If we find any, it's a binary file.
    $nonPrintable = [System.Char[]] (0..8 + 10..31 + 127 + 129 + 141 + 143 + 144 + 157)
    $affectedLineCount =
        Get-Content -Path $Path -TotalCount 5 |
            Where-Object { $_.IndexOfAny($nonPrintable) -ne -1 } |
                Measure-Object |
                    Select-Object -ExpandProperty 'Count'
    if ($affectedLineCount -gt 0)
    {
        throw 'Binary files have no encoding!'
    }

    # UTF8 (EF BB BF)
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xef -and $bytes[1] -eq 0xbb -and $bytes[2] -eq 0xbf)
    {
        return [System.Text.Encoding]::UTF8
    }

    # The UTF32 checks must run before the UTF16 checks, because the UTF32
    # Little-Endian BOM (FF FE 00 00) starts with the UTF16 Little-Endian
    # BOM (FF FE) and would otherwise be reported as UTF16.

    # UTF32 Big-Endian (00 00 FE FF)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0x00 -and $bytes[1] -eq 0x00 -and $bytes[2] -eq 0xfe -and $bytes[3] -eq 0xff)
    {
        # [System.Text.Encoding]::UTF32 is little-endian, so the big-endian
        # variant is created explicitly (bigEndian = true, byteOrderMark = true).
        return (New-Object -TypeName 'System.Text.UTF32Encoding' -ArgumentList $true, $true)
    }

    # UTF32 Little-Endian (FF FE 00 00)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0xff -and $bytes[1] -eq 0xfe -and $bytes[2] -eq 0x00 -and $bytes[3] -eq 0x00)
    {
        return [System.Text.Encoding]::UTF32
    }

    # UTF16 Big-Endian (FE FF)
    if ($bytes.Length -ge 2 -and $bytes[0] -eq 0xfe -and $bytes[1] -eq 0xff)
    {
        return [System.Text.Encoding]::BigEndianUnicode
    }

    # UTF16 Little-Endian (FF FE)
    if ($bytes.Length -ge 2 -and $bytes[0] -eq 0xff -and $bytes[1] -eq 0xfe)
    {
        return [System.Text.Encoding]::Unicode
    }

    # UTF7 (2B 2F 76 38|39|2B|2F)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0x2b -and $bytes[1] -eq 0x2f -and $bytes[2] -eq 0x76 -and ($bytes[3] -eq 0x38 -or $bytes[3] -eq 0x39 -or $bytes[3] -eq 0x2b -or $bytes[3] -eq 0x2f))
    {
        throw 'UTF7 is not a supported encoding!'
    }

    # UTF-1 (F7 64 4C)
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xf7 -and $bytes[1] -eq 0x64 -and $bytes[2] -eq 0x4c)
    {
        throw 'UTF-1 is not a supported encoding!'
    }

    # UTF-EBCDIC (DD 73 66 73)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0xdd -and $bytes[1] -eq 0x73 -and $bytes[2] -eq 0x66 -and $bytes[3] -eq 0x73)
    {
        throw 'UTF-EBCDIC is not a supported encoding!'
    }

    # SCSU (0E FE FF)
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0x0e -and $bytes[1] -eq 0xfe -and $bytes[2] -eq 0xff)
    {
        throw 'SCSU is not a supported encoding!'
    }

    # BOCU-1 (FB EE 28)
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xfb -and $bytes[1] -eq 0xee -and $bytes[2] -eq 0x28)
    {
        throw 'BOCU-1 is not a supported encoding!'
    }

    # GB-18030 (84 31 95 33)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0x84 -and $bytes[1] -eq 0x31 -and $bytes[2] -eq 0x95 -and $bytes[3] -eq 0x33)
    {
        throw 'GB-18030 is not a supported encoding!'
    }

    # If the function reaches this point, the encoding was NOT found by
    # parsing the BOM header. Starting from here, we are guessing based on the
    # file content, so the raw bytes of the whole file are read.
    if ($PSVersionTable.PSVersion.Major -lt 6)
    {
        [System.Byte[]] $content = Get-Content -Path $Path -Encoding 'Byte' -Raw
    }
    else
    {
        [System.Byte[]] $content = Get-Content -Path $Path -AsByteStream -Raw
    }

    # We are checking if any byte has a value greater than 127, which
    # indicates a UTF8 encoded file. The comparison is done numerically on
    # each byte; a regex match against the byte array would only see the
    # decimal string of every byte and could never detect a non-ASCII value.
    foreach ($byte in $content)
    {
        if ($byte -gt 0x7F)
        {
            return [System.Text.Encoding]::UTF8
        }
    }

    return [System.Text.Encoding]::ASCII
}