55import sys
66import tempfile
77import textwrap
8+ import typing
9+ import urllib .parse
10+ import urllib .request
811from pathlib import Path
912from typing import Iterable , Iterator , Union
1013
1114from .handler import _check_log_handler , logger
1215from .pandoc_download import DEFAULT_TARGET_FOLDER , download_pandoc
13- from .py3compat import cast_bytes , cast_unicode , url2path , urlparse
1416
1517__author__ = "Juho Vepsäläinen"
1618__author_email__ = "bebraw@gmail.com"
5355
5456
5557def convert_text (
56- source : str ,
58+ source : typing . Union [ str , bytes ] ,
5759 to : str ,
5860 format : str ,
5961 extra_args : Iterable = (),
@@ -66,7 +68,7 @@ def convert_text(
6668) -> str :
6769 """Converts given `source` from `format` to `to`.
6870
69- :param str source: Unicode string or bytes (see encoding)
71+ :param source: Unicode string or bytes (see encoding)
7072
7173 :param str to: format into which the input should be converted;
7274 can be one of `pypandoc.get_pandoc_formats()[1]`
@@ -106,7 +108,10 @@ def convert_text(
106108 if pandoc is not found; make sure it has been installed
107109 and is available at path.
108110 """
109- source = _as_unicode (source , encoding )
111+
112+ if isinstance (source , bytes ):
113+ source = source .decode (encoding , errors = "ignore" )
114+
110115 return _convert_input (
111116 source ,
112117 format ,
@@ -286,7 +291,7 @@ def _identify_path(source) -> bool:
286291 if not is_path :
287292 try :
288293 # check if it's an URL
289- result = urlparse (source )
294+ result = urllib . parse . urlparse (source )
290295 if result .scheme in ["http" , "https" ]:
291296 is_path = True
292297 elif result .scheme and result .netloc and result .path :
@@ -303,7 +308,7 @@ def _identify_path(source) -> bool:
303308def _is_network_path (source ):
304309 try :
305310 # check if it's an URL
306- result = urlparse (source )
311+ result = urllib . parse . urlparse (source )
307312 if result .scheme in ["http" , "https" ]:
308313 return True
309314 elif result .scheme and result .netloc and result .path :
@@ -320,17 +325,6 @@ def _identify_format_from_path(sourcefile: str, format: str) -> str:
320325 return format or os .path .splitext (sourcefile )[1 ].strip ("." )
321326
322327
323- def _as_unicode (source : any , encoding : str ) -> any :
324- if encoding != "utf-8" :
325- # if a source and a different encoding is given,
326- # try to decode the source into a string
327- try :
328- source = cast_unicode (source , encoding = encoding )
329- except (UnicodeDecodeError , UnicodeEncodeError ):
330- pass
331- return source
332-
333-
334328def normalize_format (fmt ):
335329 formats = {
336330 "dbk" : "docbook" ,
@@ -404,7 +398,7 @@ def _validate_formats(format, to, outputfile):
404398
405399
406400def _convert_input (
407- source ,
401+ source : str ,
408402 format ,
409403 input_type ,
410404 to ,
@@ -509,19 +503,9 @@ def _convert_input(
509503 )
510504
511505 if string_input :
512- try :
513- source = cast_bytes (source , encoding = "utf-8" )
514- except (UnicodeDecodeError , UnicodeEncodeError ):
515- # assume that it is already a utf-8 encoded string
516- pass
517- try :
518- stdout , stderr = p .communicate (source if string_input else None )
519- except OSError :
520- # this is happening only on Py2.6 when pandoc dies before reading all
521- # the input. We treat that the same as when we exit with an error...
522- raise RuntimeError (
523- 'Pandoc died with exitcode "%s" during conversion.' % (p .returncode )
524- )
506+ if isinstance (source , str ):
507+ source = source .encode ("utf-8" )
508+ stdout , stderr = p .communicate (source if string_input else None )
525509
526510 try :
527511 if not (to in ["odt" , "docx" , "epub" , "epub3" , "pdf" ] and outputfile == "-" ):
@@ -957,6 +941,10 @@ def ensure_pandoc_installed(
957941 _ensure_pandoc_path ()
958942
959943
def url2path(url):
    """Return the local filesystem path encoded in `url`.

    Only the path component of the parsed URL is used; it is converted
    with `urllib.request.url2pathname`, which decodes percent-escapes
    and applies the platform's path syntax.

    :param str url: URL (typically a ``file://`` URL) to convert

    :returns: the path component of `url` as a local path
    :rtype: str
    """
    url_path = urllib.parse.urlparse(url).path
    return urllib.request.url2pathname(url_path)
946+
947+
960948# -----------------------------------------------------------------------------
961949# Internal state management
962950# -----------------------------------------------------------------------------
0 commit comments