[NOTEPAD] Encoding detection (#1852)
[reactos.git] / base / applications / notepad / text.c
1 /*
2 * Notepad (text.c)
3 *
4 * Copyright 1998,99 Marcel Baur <mbaur@g26.ethz.ch>
5 * Copyright 2002 Sylvain Petreolle <spetreolle@yahoo.fr>
6 * Copyright 2002 Andriy Palamarchuk
7 * Copyright 2019 Katayama Hirofumi MZ <katayama.hirofumi.mz@gmail.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #include "notepad.h"
25
26 static BOOL Append(LPWSTR *ppszText, DWORD *pdwTextLen, LPCWSTR pszAppendText, DWORD dwAppendLen)
27 {
28 LPWSTR pszNewText;
29
30 if (dwAppendLen > 0)
31 {
32 if (*ppszText)
33 {
34 pszNewText = (LPWSTR) HeapReAlloc(GetProcessHeap(), 0, *ppszText, (*pdwTextLen + dwAppendLen) * sizeof(WCHAR));
35 }
36 else
37 {
38 pszNewText = (LPWSTR) HeapAlloc(GetProcessHeap(), 0, dwAppendLen * sizeof(WCHAR));
39 }
40
41 if (!pszNewText)
42 return FALSE;
43
44 memcpy(pszNewText + *pdwTextLen, pszAppendText, dwAppendLen * sizeof(WCHAR));
45 *ppszText = pszNewText;
46 *pdwTextLen += dwAppendLen;
47 }
48 return TRUE;
49 }
50
51 ENCODING AnalyzeEncoding(const char *pBytes, DWORD dwSize)
52 {
53 INT flags = IS_TEXT_UNICODE_STATISTICS;
54
55 if (dwSize <= 1)
56 return ENCODING_ANSI;
57
58 if (IsTextUnicode(pBytes, dwSize, &flags))
59 {
60 return ENCODING_UTF16LE;
61 }
62
63 if ((flags & IS_TEXT_UNICODE_REVERSE_MASK) && !(flags & IS_TEXT_UNICODE_ILLEGAL_CHARS))
64 {
65 return ENCODING_UTF16BE;
66 }
67
68 /* is it UTF-8? */
69 if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pBytes, dwSize, NULL, 0))
70 {
71 return ENCODING_UTF8;
72 }
73
74 return ENCODING_ANSI;
75 }
76
77 BOOL
78 ReadText(HANDLE hFile, LPWSTR *ppszText, DWORD *pdwTextLen, ENCODING *pencFile, int *piEoln)
79 {
80 DWORD dwSize;
81 LPBYTE pBytes = NULL;
82 LPWSTR pszText;
83 LPWSTR pszAllocText = NULL;
84 DWORD dwPos, i;
85 DWORD dwCharCount;
86 BOOL bSuccess = FALSE;
87 BYTE b = 0;
88 ENCODING encFile = ENCODING_ANSI;
89 int iCodePage = 0;
90 WCHAR szCrlf[2] = {'\r', '\n'};
91 DWORD adwEolnCount[3] = {0, 0, 0};
92
93 *ppszText = NULL;
94 *pdwTextLen = 0;
95
96 dwSize = GetFileSize(hFile, NULL);
97 if (dwSize == INVALID_FILE_SIZE)
98 goto done;
99
100 pBytes = HeapAlloc(GetProcessHeap(), 0, dwSize + 2);
101 if (!pBytes)
102 goto done;
103
104 if (!ReadFile(hFile, pBytes, dwSize, &dwSize, NULL))
105 goto done;
106 dwPos = 0;
107
108 /* Make sure that there is a NUL character at the end, in any encoding */
109 pBytes[dwSize + 0] = '\0';
110 pBytes[dwSize + 1] = '\0';
111
112 /* Look for Byte Order Marks */
113 if ((dwSize >= 2) && (pBytes[0] == 0xFF) && (pBytes[1] == 0xFE))
114 {
115 encFile = ENCODING_UTF16LE;
116 dwPos += 2;
117 }
118 else if ((dwSize >= 2) && (pBytes[0] == 0xFE) && (pBytes[1] == 0xFF))
119 {
120 encFile = ENCODING_UTF16BE;
121 dwPos += 2;
122 }
123 else if ((dwSize >= 3) && (pBytes[0] == 0xEF) && (pBytes[1] == 0xBB) && (pBytes[2] == 0xBF))
124 {
125 encFile = ENCODING_UTF8;
126 dwPos += 3;
127 }
128 else
129 {
130 encFile = AnalyzeEncoding((const char *)pBytes, dwSize);
131 }
132
133 switch(encFile)
134 {
135 case ENCODING_UTF16BE:
136 for (i = dwPos; i < dwSize-1; i += 2)
137 {
138 b = pBytes[i+0];
139 pBytes[i+0] = pBytes[i+1];
140 pBytes[i+1] = b;
141 }
142 /* fall through */
143
144 case ENCODING_UTF16LE:
145 pszText = (LPWSTR) &pBytes[dwPos];
146 dwCharCount = (dwSize - dwPos) / sizeof(WCHAR);
147 break;
148
149 case ENCODING_ANSI:
150 case ENCODING_UTF8:
151 if (encFile == ENCODING_ANSI)
152 iCodePage = CP_ACP;
153 else if (encFile == ENCODING_UTF8)
154 iCodePage = CP_UTF8;
155
156 if ((dwSize - dwPos) > 0)
157 {
158 dwCharCount = MultiByteToWideChar(iCodePage, 0, (LPCSTR)&pBytes[dwPos], dwSize - dwPos, NULL, 0);
159 if (dwCharCount == 0)
160 goto done;
161 }
162 else
163 {
164 /* special case for files with no characters (other than BOMs) */
165 dwCharCount = 0;
166 }
167
168 pszAllocText = (LPWSTR) HeapAlloc(GetProcessHeap(), 0, (dwCharCount + 1) * sizeof(WCHAR));
169 if (!pszAllocText)
170 goto done;
171
172 if ((dwSize - dwPos) > 0)
173 {
174 if (!MultiByteToWideChar(iCodePage, 0, (LPCSTR)&pBytes[dwPos], dwSize - dwPos, pszAllocText, dwCharCount))
175 goto done;
176 }
177
178 pszAllocText[dwCharCount] = '\0';
179 pszText = pszAllocText;
180 break;
181 DEFAULT_UNREACHABLE;
182 }
183
184 dwPos = 0;
185 for (i = 0; i < dwCharCount; i++)
186 {
187 switch(pszText[i])
188 {
189 case '\r':
190 if ((i < dwCharCount-1) && (pszText[i+1] == '\n'))
191 {
192 i++;
193 adwEolnCount[EOLN_CRLF]++;
194 break;
195 }
196 /* fall through */
197
198 case '\n':
199 if (!Append(ppszText, pdwTextLen, &pszText[dwPos], i - dwPos))
200 return FALSE;
201 if (!Append(ppszText, pdwTextLen, szCrlf, ARRAY_SIZE(szCrlf)))
202 return FALSE;
203 dwPos = i + 1;
204
205 if (pszText[i] == '\r')
206 adwEolnCount[EOLN_CR]++;
207 else
208 adwEolnCount[EOLN_LF]++;
209 break;
210
211 case '\0':
212 pszText[i] = ' ';
213 break;
214 }
215 }
216
217 if (!*ppszText && (pszText == pszAllocText))
218 {
219 /* special case; don't need to reallocate */
220 *ppszText = pszAllocText;
221 *pdwTextLen = dwCharCount;
222 pszAllocText = NULL;
223 }
224 else
225 {
226 /* append last remaining text */
227 if (!Append(ppszText, pdwTextLen, &pszText[dwPos], i - dwPos + 1))
228 return FALSE;
229 }
230
231 /* chose which eoln to use */
232 *piEoln = EOLN_CRLF;
233 if (adwEolnCount[EOLN_LF] > adwEolnCount[*piEoln])
234 *piEoln = EOLN_LF;
235 if (adwEolnCount[EOLN_CR] > adwEolnCount[*piEoln])
236 *piEoln = EOLN_CR;
237 *pencFile = encFile;
238
239 bSuccess = TRUE;
240
241 done:
242 if (pBytes)
243 HeapFree(GetProcessHeap(), 0, pBytes);
244 if (pszAllocText)
245 HeapFree(GetProcessHeap(), 0, pszAllocText);
246
247 if (!bSuccess && *ppszText)
248 {
249 HeapFree(GetProcessHeap(), 0, *ppszText);
250 *ppszText = NULL;
251 *pdwTextLen = 0;
252 }
253 return bSuccess;
254 }
255
256 static BOOL WriteEncodedText(HANDLE hFile, LPCWSTR pszText, DWORD dwTextLen, ENCODING encFile)
257 {
258 LPBYTE pBytes = NULL;
259 LPBYTE pAllocBuffer = NULL;
260 DWORD dwPos = 0;
261 DWORD dwByteCount;
262 BYTE buffer[1024];
263 UINT iCodePage = 0;
264 DWORD dwDummy, i;
265 BOOL bSuccess = FALSE;
266 int iBufferSize, iRequiredBytes;
267 BYTE b;
268
269 while(dwPos < dwTextLen)
270 {
271 switch(encFile)
272 {
273 case ENCODING_UTF16LE:
274 pBytes = (LPBYTE) &pszText[dwPos];
275 dwByteCount = (dwTextLen - dwPos) * sizeof(WCHAR);
276 dwPos = dwTextLen;
277 break;
278
279 case ENCODING_UTF16BE:
280 dwByteCount = (dwTextLen - dwPos) * sizeof(WCHAR);
281 if (dwByteCount > sizeof(buffer))
282 dwByteCount = sizeof(buffer);
283
284 memcpy(buffer, &pszText[dwPos], dwByteCount);
285 for (i = 0; i < dwByteCount; i += 2)
286 {
287 b = buffer[i+0];
288 buffer[i+0] = buffer[i+1];
289 buffer[i+1] = b;
290 }
291 pBytes = (LPBYTE) &buffer[dwPos];
292 dwPos += dwByteCount / sizeof(WCHAR);
293 break;
294
295 case ENCODING_ANSI:
296 case ENCODING_UTF8:
297 if (encFile == ENCODING_ANSI)
298 iCodePage = CP_ACP;
299 else if (encFile == ENCODING_UTF8)
300 iCodePage = CP_UTF8;
301
302 iRequiredBytes = WideCharToMultiByte(iCodePage, 0, &pszText[dwPos], dwTextLen - dwPos, NULL, 0, NULL, NULL);
303 if (iRequiredBytes <= 0)
304 {
305 goto done;
306 }
307 else if (iRequiredBytes < sizeof(buffer))
308 {
309 pBytes = buffer;
310 iBufferSize = sizeof(buffer);
311 }
312 else
313 {
314 pAllocBuffer = (LPBYTE) HeapAlloc(GetProcessHeap(), 0, iRequiredBytes);
315 if (!pAllocBuffer)
316 return FALSE;
317 pBytes = pAllocBuffer;
318 iBufferSize = iRequiredBytes;
319 }
320
321 dwByteCount = WideCharToMultiByte(iCodePage, 0, &pszText[dwPos], dwTextLen - dwPos, (LPSTR) pBytes, iBufferSize, NULL, NULL);
322 if (!dwByteCount)
323 goto done;
324
325 dwPos = dwTextLen;
326 break;
327
328 default:
329 goto done;
330 }
331
332 if (!WriteFile(hFile, pBytes, dwByteCount, &dwDummy, NULL))
333 goto done;
334
335 /* free the buffer, if we have allocated one */
336 if (pAllocBuffer)
337 {
338 HeapFree(GetProcessHeap(), 0, pAllocBuffer);
339 pAllocBuffer = NULL;
340 }
341 }
342 bSuccess = TRUE;
343
344 done:
345 if (pAllocBuffer)
346 HeapFree(GetProcessHeap(), 0, pAllocBuffer);
347 return bSuccess;
348 }
349
350 BOOL WriteText(HANDLE hFile, LPCWSTR pszText, DWORD dwTextLen, ENCODING encFile, int iEoln)
351 {
352 WCHAR wcBom;
353 LPCWSTR pszLF = L"\n";
354 DWORD dwPos, dwNext;
355
356 /* Write the proper byte order marks if not ANSI */
357 if (encFile != ENCODING_ANSI)
358 {
359 wcBom = 0xFEFF;
360 if (!WriteEncodedText(hFile, &wcBom, 1, encFile))
361 return FALSE;
362 }
363
364 dwPos = 0;
365
366 /* pszText eoln are always \r\n */
367
368 do
369 {
370 /* Find the next eoln */
371 dwNext = dwPos;
372 while(dwNext < dwTextLen)
373 {
374 if (pszText[dwNext] == '\r' && pszText[dwNext + 1] == '\n')
375 break;
376 dwNext++;
377 }
378
379 if (dwNext != dwTextLen)
380 {
381 switch (iEoln)
382 {
383 case EOLN_LF:
384 /* Write text (without eoln) */
385 if (!WriteEncodedText(hFile, &pszText[dwPos], dwNext - dwPos, encFile))
386 return FALSE;
387 /* Write eoln */
388 if (!WriteEncodedText(hFile, pszLF, 1, encFile))
389 return FALSE;
390 break;
391 case EOLN_CR:
392 /* Write text (including \r as eoln) */
393 if (!WriteEncodedText(hFile, &pszText[dwPos], dwNext - dwPos + 1, encFile))
394 return FALSE;
395 break;
396 case EOLN_CRLF:
397 /* Write text (including \r\n as eoln) */
398 if (!WriteEncodedText(hFile, &pszText[dwPos], dwNext - dwPos + 2, encFile))
399 return FALSE;
400 break;
401 default:
402 return FALSE;
403 }
404 }
405 else
406 {
407 /* Write text (without eoln, since this is the end of the file) */
408 if (!WriteEncodedText(hFile, &pszText[dwPos], dwNext - dwPos, encFile))
409 return FALSE;
410 }
411
412 /* Skip \r\n */
413 dwPos = dwNext + 2;
414 }
415 while (dwPos < dwTextLen);
416
417 return TRUE;
418 }