""" psycopg copy support """ # Copyright (C) 2020 The Psycopg Team import re import queue import struct import asyncio import threading from abc import ABC, abstractmethod from types import TracebackType from typing import Any, AsyncIterator, Dict, Generic, Iterator, List, Match, IO from typing import Optional, Sequence, Tuple, Type, TypeVar, Union, TYPE_CHECKING from . import pq from . import adapt from . import errors as e from .abc import Buffer, ConnectionType, PQGen, Transformer from ._compat import create_task from .pq.misc import connection_summary from ._cmodule import _psycopg from ._encodings import pgconn_encoding from .generators import copy_from, copy_to, copy_end if TYPE_CHECKING: from .cursor import BaseCursor, Cursor from .cursor_async import AsyncCursor from .connection import Connection # noqa: F401 from .connection_async import AsyncConnection # noqa: F401 PY_TEXT = adapt.PyFormat.TEXT PY_BINARY = adapt.PyFormat.BINARY TEXT = pq.Format.TEXT BINARY = pq.Format.BINARY COPY_IN = pq.ExecStatus.COPY_IN COPY_OUT = pq.ExecStatus.COPY_OUT ACTIVE = pq.TransactionStatus.ACTIVE # Size of data to accumulate before sending it down the network. We fill a # buffer this size field by field, and when it passes the threshold size # we ship it, so it may end up being bigger than this. BUFFER_SIZE = 32 * 1024 # Maximum data size we want to queue to send to the libpq copy. Sending a # buffer too big to be handled can cause an infinite loop in the libpq # (#255) so we want to split it in more digestable chunks. MAX_BUFFER_SIZE = 4 * BUFFER_SIZE # Note: making this buffer too large, e.g. # MAX_BUFFER_SIZE = 1024 * 1024 # makes operations *way* slower! Probably triggering some quadraticity # in the libpq memory management and data sending. # Max size of the write queue of buffers. More than that copy will block # Each buffer should be around BUFFER_SIZE size. QUEUE_SIZE = 1024 class BaseCopy(Generic[ConnectionType]): """ Base implementation for the copy user interface. Two subclasses expose real methods with the sync/async differences. The difference between the text and binary format is managed by two different `Formatter` subclasses. Writing (the I/O part) is implemented in the subclasses by a `Writer` or `AsyncWriter` instance. Normally writing implies sending copy data to a database, but a different writer might be chosen, e.g. to stream data into a file for later use. """ _Self = TypeVar("_Self", bound="BaseCopy[Any]") formatter: "Formatter" def __init__( self, cursor: "BaseCursor[ConnectionType, Any]", *, binary: Optional[bool] = None, ): self.cursor = cursor self.connection = cursor.connection self._pgconn = self.connection.pgconn result = cursor.pgresult if result: self._direction = result.status if self._direction != COPY_IN and self._direction != COPY_OUT: raise e.ProgrammingError( "the cursor should have performed a COPY operation;" f" its status is {pq.ExecStatus(self._direction).name} instead" ) else: self._direction = COPY_IN if binary is None: binary = bool(result and result.binary_tuples) tx: Transformer = getattr(cursor, "_tx", None) or adapt.Transformer(cursor) if binary: self.formatter = BinaryFormatter(tx) else: self.formatter = TextFormatter(tx, encoding=pgconn_encoding(self._pgconn)) self._finished = False def __repr__(self) -> str: cls = f"{self.__class__.__module__}.{self.__class__.__qualname__}" info = connection_summary(self._pgconn) return f"<{cls} {info} at 0x{id(self):x}>" def _enter(self) -> None: if self._finished: raise TypeError("copy blocks can be used only once") def set_types(self, types: Sequence[Union[int, str]]) -> None: """ Set the types expected in a COPY operation. The types must be specified as a sequence of oid or PostgreSQL type names (e.g. ``int4``, ``timestamptz[]``). This operation overcomes the lack of metadata returned by PostgreSQL when a COPY operation begins: - On :sql:`COPY TO`, `!set_types()` allows to specify what types the operation returns. If `!set_types()` is not used, the data will be returned as unparsed strings or bytes instead of Python objects. - On :sql:`COPY FROM`, `!set_types()` allows to choose what type the database expects. This is especially useful in binary copy, because PostgreSQL will apply no cast rule. """ registry = self.cursor.adapters.types oids = [t if isinstance(t, int) else registry.get_oid(t) for t in types] if self._direction == COPY_IN: self.formatter.transformer.set_dumper_types(oids, self.formatter.format) else: self.formatter.transformer.set_loader_types(oids, self.formatter.format) # High level copy protocol generators (state change of the Copy object) def _read_gen(self) -> PQGen[Buffer]: if self._finished: return memoryview(b"") res = yield from copy_from(self._pgconn) if isinstance(res, memoryview): return res # res is the final PGresult self._finished = True # This result is a COMMAND_OK which has info about the number of rows # returned, but not about the columns, which is instead an information # that was received on the COPY_OUT result at the beginning of COPY. # So, don't replace the results in the cursor, just update the rowcount. nrows = res.command_tuples self.cursor._rowcount = nrows if nrows is not None else -1 return memoryview(b"") def _read_row_gen(self) -> PQGen[Optional[Tuple[Any, ...]]]: data = yield from self._read_gen() if not data: return None row = self.formatter.parse_row(data) if row is None: # Get the final result to finish the copy operation yield from self._read_gen() self._finished = True return None return row def _end_copy_out_gen(self, exc: Optional[BaseException]) -> PQGen[None]: if not exc: return if self._pgconn.transaction_status != ACTIVE: # The server has already finished to send copy data. The connection # is already in a good state. return # Throw a cancel to the server, then consume the rest of the copy data # (which might or might not have been already transferred entirely to # the client, so we won't necessary see the exception associated with # canceling). self.connection.cancel() try: while (yield from self._read_gen()): pass except e.QueryCanceled: pass class Copy(BaseCopy["Connection[Any]"]): """Manage a :sql:`COPY` operation. :param cursor: the cursor where the operation is performed. :param binary: if `!True`, write binary format. :param writer: the object to write to destination. If not specified, write to the `!cursor` connection. Choosing `!binary` is not necessary if the cursor has executed a :sql:`COPY` operation, because the operation result describes the format too. The parameter is useful when a `!Copy` object is created manually and no operation is performed on the cursor, such as when using ``writer=``\\ `~psycopg.copy.FileWriter`. """ __module__ = "psycopg" writer: "Writer" def __init__( self, cursor: "Cursor[Any]", *, binary: Optional[bool] = None, writer: Optional["Writer"] = None, ): super().__init__(cursor, binary=binary) if not writer: writer = LibpqWriter(cursor) self.writer = writer self._write = writer.write def __enter__(self: BaseCopy._Self) -> BaseCopy._Self: self._enter() return self def __exit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: self.finish(exc_val) # End user sync interface def __iter__(self) -> Iterator[Buffer]: """Implement block-by-block iteration on :sql:`COPY TO`.""" while True: data = self.read() if not data: break yield data def read(self) -> Buffer: """ Read an unparsed row after a :sql:`COPY TO` operation. Return an empty string when the data is finished. """ return self.connection.wait(self._read_gen()) def rows(self) -> Iterator[Tuple[Any, ...]]: """ Iterate on the result of a :sql:`COPY TO` operation record by record. Note that the records returned will be tuples of unparsed strings or bytes, unless data types are specified using `set_types()`. """ while True: record = self.read_row() if record is None: break yield record def read_row(self) -> Optional[Tuple[Any, ...]]: """ Read a parsed row of data from a table after a :sql:`COPY TO` operation. Return `!None` when the data is finished. Note that the records returned will be tuples of unparsed strings or bytes, unless data types are specified using `set_types()`. """ return self.connection.wait(self._read_row_gen()) def write(self, buffer: Union[Buffer, str]) -> None: """ Write a block of data to a table after a :sql:`COPY FROM` operation. If the :sql:`COPY` is in binary format `!buffer` must be `!bytes`. In text mode it can be either `!bytes` or `!str`. """ data = self.formatter.write(buffer) if data: self._write(data) def write_row(self, row: Sequence[Any]) -> None: """Write a record to a table after a :sql:`COPY FROM` operation.""" data = self.formatter.write_row(row) if data: self._write(data) def finish(self, exc: Optional[BaseException]) -> None: """Terminate the copy operation and free the resources allocated. You shouldn't need to call this function yourself: it is usually called by exit. It is available if, despite what is documented, you end up using the `Copy` object outside a block. """ if self._direction == COPY_IN: data = self.formatter.end() if data: self._write(data) self.writer.finish(exc) self._finished = True else: self.connection.wait(self._end_copy_out_gen(exc)) class Writer(ABC): """ A class to write copy data somewhere. """ @abstractmethod def write(self, data: Buffer) -> None: """ Write some data to destination. """ ... def finish(self, exc: Optional[BaseException] = None) -> None: """ Called when write operations are finished. If operations finished with an error, it will be passed to ``exc``. """ pass class LibpqWriter(Writer): """ A `Writer` to write copy data to a Postgres database. """ def __init__(self, cursor: "Cursor[Any]"): self.cursor = cursor self.connection = cursor.connection self._pgconn = self.connection.pgconn def write(self, data: Buffer) -> None: if len(data) <= MAX_BUFFER_SIZE: # Most used path: we don't need to split the buffer in smaller # bits, so don't make a copy. self.connection.wait(copy_to(self._pgconn, data)) else: # Copy a buffer too large in chunks to avoid causing a memory # error in the libpq, which may cause an infinite loop (#255). for i in range(0, len(data), MAX_BUFFER_SIZE): self.connection.wait( copy_to(self._pgconn, data[i : i + MAX_BUFFER_SIZE]) ) def finish(self, exc: Optional[BaseException] = None) -> None: bmsg: Optional[bytes] if exc: msg = f"error from Python: {type(exc).__qualname__} - {exc}" bmsg = msg.encode(pgconn_encoding(self._pgconn), "replace") else: bmsg = None try: res = self.connection.wait(copy_end(self._pgconn, bmsg)) # The QueryCanceled is expected if we sent an exception message to # pgconn.put_copy_end(). The Python exception that generated that # cancelling is more important, so don't clobber it. except e.QueryCanceled: if not bmsg: raise else: self.cursor._results = [res] class QueuedLibpqWriter(LibpqWriter): """ A writer using a buffer to queue data to write to a Postgres database. `write()` returns immediately, so that the main thread can be CPU-bound formatting messages, while a worker thread can be IO-bound waiting to write on the connection. """ def __init__(self, cursor: "Cursor[Any]"): super().__init__(cursor) self._queue: queue.Queue[Buffer] = queue.Queue(maxsize=QUEUE_SIZE) self._worker: Optional[threading.Thread] = None self._worker_error: Optional[BaseException] = None def worker(self) -> None: """Push data to the server when available from the copy queue. Terminate reading when the queue receives a false-y value, or in case of error. The function is designed to be run in a separate thread. """ try: while True: data = self._queue.get(block=True, timeout=24 * 60 * 60) if not data: break self.connection.wait(copy_to(self._pgconn, data)) except BaseException as ex: # Propagate the error to the main thread. self._worker_error = ex def write(self, data: Buffer) -> None: if not self._worker: # warning: reference loop, broken by _write_end self._worker = threading.Thread(target=self.worker) self._worker.daemon = True self._worker.start() # If the worker thread raies an exception, re-raise it to the caller. if self._worker_error: raise self._worker_error if len(data) <= MAX_BUFFER_SIZE: # Most used path: we don't need to split the buffer in smaller # bits, so don't make a copy. self._queue.put(data) else: # Copy a buffer too large in chunks to avoid causing a memory # error in the libpq, which may cause an infinite loop (#255). for i in range(0, len(data), MAX_BUFFER_SIZE): self._queue.put(data[i : i + MAX_BUFFER_SIZE]) def finish(self, exc: Optional[BaseException] = None) -> None: self._queue.put(b"") if self._worker: self._worker.join() self._worker = None # break the loop # Check if the worker thread raised any exception before terminating. if self._worker_error: raise self._worker_error super().finish(exc) class FileWriter(Writer): """ A `Writer` to write copy data to a file-like object. :param file: the file where to write copy data. It must be open for writing in binary mode. """ def __init__(self, file: IO[bytes]): self.file = file def write(self, data: Buffer) -> None: self.file.write(data) class AsyncCopy(BaseCopy["AsyncConnection[Any]"]): """Manage an asynchronous :sql:`COPY` operation.""" __module__ = "psycopg" writer: "AsyncWriter" def __init__( self, cursor: "AsyncCursor[Any]", *, binary: Optional[bool] = None, writer: Optional["AsyncWriter"] = None, ): super().__init__(cursor, binary=binary) if not writer: writer = AsyncLibpqWriter(cursor) self.writer = writer self._write = writer.write async def __aenter__(self: BaseCopy._Self) -> BaseCopy._Self: self._enter() return self async def __aexit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: await self.finish(exc_val) async def __aiter__(self) -> AsyncIterator[Buffer]: while True: data = await self.read() if not data: break yield data async def read(self) -> Buffer: return await self.connection.wait(self._read_gen()) async def rows(self) -> AsyncIterator[Tuple[Any, ...]]: while True: record = await self.read_row() if record is None: break yield record async def read_row(self) -> Optional[Tuple[Any, ...]]: return await self.connection.wait(self._read_row_gen()) async def write(self, buffer: Union[Buffer, str]) -> None: data = self.formatter.write(buffer) if data: await self._write(data) async def write_row(self, row: Sequence[Any]) -> None: data = self.formatter.write_row(row) if data: await self._write(data) async def finish(self, exc: Optional[BaseException]) -> None: if self._direction == COPY_IN: data = self.formatter.end() if data: await self._write(data) await self.writer.finish(exc) self._finished = True else: await self.connection.wait(self._end_copy_out_gen(exc)) class AsyncWriter(ABC): """ A class to write copy data somewhere (for async connections). """ @abstractmethod async def write(self, data: Buffer) -> None: ... async def finish(self, exc: Optional[BaseException] = None) -> None: pass class AsyncLibpqWriter(AsyncWriter): """ An `AsyncWriter` to write copy data to a Postgres database. """ def __init__(self, cursor: "AsyncCursor[Any]"): self.cursor = cursor self.connection = cursor.connection self._pgconn = self.connection.pgconn async def write(self, data: Buffer) -> None: if len(data) <= MAX_BUFFER_SIZE: # Most used path: we don't need to split the buffer in smaller # bits, so don't make a copy. await self.connection.wait(copy_to(self._pgconn, data)) else: # Copy a buffer too large in chunks to avoid causing a memory # error in the libpq, which may cause an infinite loop (#255). for i in range(0, len(data), MAX_BUFFER_SIZE): await self.connection.wait( copy_to(self._pgconn, data[i : i + MAX_BUFFER_SIZE]) ) async def finish(self, exc: Optional[BaseException] = None) -> None: bmsg: Optional[bytes] if exc: msg = f"error from Python: {type(exc).__qualname__} - {exc}" bmsg = msg.encode(pgconn_encoding(self._pgconn), "replace") else: bmsg = None try: res = await self.connection.wait(copy_end(self._pgconn, bmsg)) # The QueryCanceled is expected if we sent an exception message to # pgconn.put_copy_end(). The Python exception that generated that # cancelling is more important, so don't clobber it. except e.QueryCanceled: if not bmsg: raise else: self.cursor._results = [res] class AsyncQueuedLibpqWriter(AsyncLibpqWriter): """ An `AsyncWriter` using a buffer to queue data to write. `write()` returns immediately, so that the main thread can be CPU-bound formatting messages, while a worker thread can be IO-bound waiting to write on the connection. """ def __init__(self, cursor: "AsyncCursor[Any]"): super().__init__(cursor) self._queue: asyncio.Queue[Buffer] = asyncio.Queue(maxsize=QUEUE_SIZE) self._worker: Optional[asyncio.Future[None]] = None async def worker(self) -> None: """Push data to the server when available from the copy queue. Terminate reading when the queue receives a false-y value. The function is designed to be run in a separate task. """ while True: data = await self._queue.get() if not data: break await self.connection.wait(copy_to(self._pgconn, data)) async def write(self, data: Buffer) -> None: if not self._worker: self._worker = create_task(self.worker()) if len(data) <= MAX_BUFFER_SIZE: # Most used path: we don't need to split the buffer in smaller # bits, so don't make a copy. await self._queue.put(data) else: # Copy a buffer too large in chunks to avoid causing a memory # error in the libpq, which may cause an infinite loop (#255). for i in range(0, len(data), MAX_BUFFER_SIZE): await self._queue.put(data[i : i + MAX_BUFFER_SIZE]) async def finish(self, exc: Optional[BaseException] = None) -> None: await self._queue.put(b"") if self._worker: await asyncio.gather(self._worker) self._worker = None # break reference loops if any await super().finish(exc) class Formatter(ABC): """ A class which understand a copy format (text, binary). """ format: pq.Format def __init__(self, transformer: Transformer): self.transformer = transformer self._write_buffer = bytearray() self._row_mode = False # true if the user is using write_row() @abstractmethod def parse_row(self, data: Buffer) -> Optional[Tuple[Any, ...]]: ... @abstractmethod def write(self, buffer: Union[Buffer, str]) -> Buffer: ... @abstractmethod def write_row(self, row: Sequence[Any]) -> Buffer: ... @abstractmethod def end(self) -> Buffer: ... class TextFormatter(Formatter): format = TEXT def __init__(self, transformer: Transformer, encoding: str = "utf-8"): super().__init__(transformer) self._encoding = encoding def parse_row(self, data: Buffer) -> Optional[Tuple[Any, ...]]: if data: return parse_row_text(data, self.transformer) else: return None def write(self, buffer: Union[Buffer, str]) -> Buffer: data = self._ensure_bytes(buffer) self._signature_sent = True return data def write_row(self, row: Sequence[Any]) -> Buffer: # Note down that we are writing in row mode: it means we will have # to take care of the end-of-copy marker too self._row_mode = True format_row_text(row, self.transformer, self._write_buffer) if len(self._write_buffer) > BUFFER_SIZE: buffer, self._write_buffer = self._write_buffer, bytearray() return buffer else: return b"" def end(self) -> Buffer: buffer, self._write_buffer = self._write_buffer, bytearray() return buffer def _ensure_bytes(self, data: Union[Buffer, str]) -> Buffer: if isinstance(data, str): return data.encode(self._encoding) else: # Assume, for simplicity, that the user is not passing stupid # things to the write function. If that's the case, things # will fail downstream. return data class BinaryFormatter(Formatter): format = BINARY def __init__(self, transformer: Transformer): super().__init__(transformer) self._signature_sent = False def parse_row(self, data: Buffer) -> Optional[Tuple[Any, ...]]: if not self._signature_sent: if data[: len(_binary_signature)] != _binary_signature: raise e.DataError( "binary copy doesn't start with the expected signature" ) self._signature_sent = True data = data[len(_binary_signature) :] elif data == _binary_trailer: return None return parse_row_binary(data, self.transformer) def write(self, buffer: Union[Buffer, str]) -> Buffer: data = self._ensure_bytes(buffer) self._signature_sent = True return data def write_row(self, row: Sequence[Any]) -> Buffer: # Note down that we are writing in row mode: it means we will have # to take care of the end-of-copy marker too self._row_mode = True if not self._signature_sent: self._write_buffer += _binary_signature self._signature_sent = True format_row_binary(row, self.transformer, self._write_buffer) if len(self._write_buffer) > BUFFER_SIZE: buffer, self._write_buffer = self._write_buffer, bytearray() return buffer else: return b"" def end(self) -> Buffer: # If we have sent no data we need to send the signature # and the trailer if not self._signature_sent: self._write_buffer += _binary_signature self._write_buffer += _binary_trailer elif self._row_mode: # if we have sent data already, we have sent the signature # too (either with the first row, or we assume that in # block mode the signature is included). # Write the trailer only if we are sending rows (with the # assumption that who is copying binary data is sending the # whole format). self._write_buffer += _binary_trailer buffer, self._write_buffer = self._write_buffer, bytearray() return buffer def _ensure_bytes(self, data: Union[Buffer, str]) -> Buffer: if isinstance(data, str): raise TypeError("cannot copy str data in binary mode: use bytes instead") else: # Assume, for simplicity, that the user is not passing stupid # things to the write function. If that's the case, things # will fail downstream. return data def _format_row_text( row: Sequence[Any], tx: Transformer, out: Optional[bytearray] = None ) -> bytearray: """Convert a row of objects to the data to send for copy.""" if out is None: out = bytearray() if not row: out += b"\n" return out for item in row: if item is not None: dumper = tx.get_dumper(item, PY_TEXT) b = dumper.dump(item) out += _dump_re.sub(_dump_sub, b) else: out += rb"\N" out += b"\t" out[-1:] = b"\n" return out def _format_row_binary( row: Sequence[Any], tx: Transformer, out: Optional[bytearray] = None ) -> bytearray: """Convert a row of objects to the data to send for binary copy.""" if out is None: out = bytearray() out += _pack_int2(len(row)) adapted = tx.dump_sequence(row, [PY_BINARY] * len(row)) for b in adapted: if b is not None: out += _pack_int4(len(b)) out += b else: out += _binary_null return out def _parse_row_text(data: Buffer, tx: Transformer) -> Tuple[Any, ...]: if not isinstance(data, bytes): data = bytes(data) fields = data.split(b"\t") fields[-1] = fields[-1][:-1] # drop \n row = [None if f == b"\\N" else _load_re.sub(_load_sub, f) for f in fields] return tx.load_sequence(row) def _parse_row_binary(data: Buffer, tx: Transformer) -> Tuple[Any, ...]: row: List[Optional[Buffer]] = [] nfields = _unpack_int2(data, 0)[0] pos = 2 for i in range(nfields): length = _unpack_int4(data, pos)[0] pos += 4 if length >= 0: row.append(data[pos : pos + length]) pos += length else: row.append(None) return tx.load_sequence(row) _pack_int2 = struct.Struct("!h").pack _pack_int4 = struct.Struct("!i").pack _unpack_int2 = struct.Struct("!h").unpack_from _unpack_int4 = struct.Struct("!i").unpack_from _binary_signature = ( b"PGCOPY\n\xff\r\n\0" # Signature b"\x00\x00\x00\x00" # flags b"\x00\x00\x00\x00" # extra length ) _binary_trailer = b"\xff\xff" _binary_null = b"\xff\xff\xff\xff" _dump_re = re.compile(b"[\b\t\n\v\f\r\\\\]") _dump_repl = { b"\b": b"\\b", b"\t": b"\\t", b"\n": b"\\n", b"\v": b"\\v", b"\f": b"\\f", b"\r": b"\\r", b"\\": b"\\\\", } def _dump_sub(m: Match[bytes], __map: Dict[bytes, bytes] = _dump_repl) -> bytes: return __map[m.group(0)] _load_re = re.compile(b"\\\\[btnvfr\\\\]") _load_repl = {v: k for k, v in _dump_repl.items()} def _load_sub(m: Match[bytes], __map: Dict[bytes, bytes] = _load_repl) -> bytes: return __map[m.group(0)] # Override functions with fast versions if available if _psycopg: format_row_text = _psycopg.format_row_text format_row_binary = _psycopg.format_row_binary parse_row_text = _psycopg.parse_row_text parse_row_binary = _psycopg.parse_row_binary else: format_row_text = _format_row_text format_row_binary = _format_row_binary parse_row_text = _parse_row_text parse_row_binary = _parse_row_binary