[IMP] orm: add optional parallelism to iter_browse.create()

cawo-odoo · cawo-odoo · commit 94f0b0b61e93 · 2025-11-10T13:44:46.000Z
As with the same support added to `__attr__` in the parent commit, callers may
only use this when the database modifications are known to be distinct, i.e.
when they cause neither concurrency issues nor side effects on the results.

`create` returns an `iter_browse` object for the caller to browse created
records. With the multiprocessing strategy, we make the following changes to
it:
- To support vast amounts of created records with the multiprocessing strategy,
  we process the values in a generator and initialize the returned
  `iter_browse` object with it. Since this requires the caller of `create` to
  always consume/iterate the result (otherwise the records will not be
  created), it is not applied to the other strategies, where it would break the
  existing API.
- Make `__iter__` yield chunks if the strategy is multiprocessing. This way, a
  caller can process chunks of freshly created records with `for records in
  util.iter_browse(strategy="multiprocessing").create(SQLStr)`, and since
  everything from input to output is a generator, the whole pipeline is
  perfectly memory efficient.
- Do not pass the logger to the `iter_browse` object returned from `create` if
  the strategy is multiprocessing, because it would lead to interleaved logging
  from the input generator and the returned object when the caller iterates it.
diff --git a/src/util/orm.py b/src/util/orm.py
@@ -367,7 +367,12 @@ def _mp_iter_browse_cb(ids_or_values, params):
         getattr(
             me.env[params["model_name"]].with_context(params["context"]).browse(ids_or_values), params["attr_name"]
         )(*params["args"], **params["kwargs"])
+    if params["mode"] == "create":
+        new_ids = me.env[params["model_name"]].with_context(params["context"]).create(ids_or_values).ids
     me.env.cr.commit()
+    if params["mode"] == "create":
+        return new_ids
+    return None
 
 
 class iter_browse(object):
@@ -543,8 +548,12 @@ def __iter__(self):
             raise RuntimeError("%r ran twice" % (self,))
 
         it = chain.from_iterable(self._it)
+        sz = self._size
+        if self._strategy == "multiprocessing":
+            it = self._it
+            sz = (self._size + self._chunk_size - 1) // self._chunk_size
         if self._logger:
-            it = log_progress(it, self._logger, qualifier=self._model._name, size=self._size)
+            it = log_progress(it, self._logger, qualifier=self._model._name, size=sz)
         self._it = None
         return chain(it, self._end())
 
@@ -626,6 +635,12 @@ def create(self, values=None, query=None, **kw):
             except TypeError:
                 raise ValueError("When passing a generator of values, the size kwarg is mandatory")
 
+        if self._strategy == "multiprocessing":
+            return self._create_multiprocess(values, size, multi)
+
+        return self._create(values, size, multi)
+
+    def _create(self, values, size, multi):
         it = chunks(values, self._chunk_size, fmt=list)
         if self._logger:
             sz = (size + self._chunk_size - 1) // self._chunk_size
@@ -651,6 +666,42 @@ def create(self, values=None, query=None, **kw):
             self._model, *args, chunk_size=self._chunk_size, logger=self._logger, strategy=self._strategy
         )
 
+    def _create_multiprocess(self, values, size, multi):
+        if not multi:
+            raise ValueError("The multiprocessing strategy only supports the multi version of `create`")
+
+        it = chunks(values, self._superchunk_size, fmt=list)
+        if self._logger:
+            sz = (size + self._superchunk_size - 1) // self._superchunk_size
+            qualifier = "env[%r].create([:%d])" % (self._model._name, self._superchunk_size)
+            it = log_progress(it, self._logger, qualifier=qualifier, size=sz)
+
+        def iter_proc():
+            params = {
+                "dbname": self._model.env.cr.dbname,
+                "model_name": self._model._name,
+                # convert to dict for pickle. Will still break if any value in the context is not pickleable
+                "context": dict(self._model.env.context),
+                "mode": "create",
+            }
+            self._model.env.cr.commit()
+            self._patch.start()
+            extrakwargs = {"mp_context": multiprocessing.get_context("fork")} if sys.version_info >= (3, 7) else {}
+            with ProcessPoolExecutor(max_workers=get_max_workers(), **extrakwargs) as executor:
+                for sub_values in it:
+                    for task_result in executor.map(
+                        _mp_iter_browse_cb, chunks(sub_values, self._chunk_size, fmt=tuple), repeat(params)
+                    ):
+                        self._model.env.cr.commit()  # make task_result visible on main cursor before yielding ids
+                        for new_id in task_result:
+                            yield new_id
+            next(self._end(), None)
+
+        self._patch = no_selection_cache_validation()
+        args = self._cr_uid + (iter_proc(),)
+        kwargs = {"size": size, "chunk_size": self._chunk_size, "logger": None, "strategy": self._strategy}
+        return iter_browse(self._model, *args, **kwargs)
+
 
 @contextmanager
 def custom_module_field_as_manual(env, rollback=True, do_flush=False):