[projects/git-slug] Automatyczny retry przy przejściowych błędach sieciowych

arekm arekm at pld-linux.org
Tue May 12 15:09:05 CEST 2026


commit d85f1a2bf8557e06405515e08d6e9c6eae1a3740
Author: Arkadiusz Miśkiewicz <arekm at maven.pl>
Date:   Thu Apr 9 23:36:06 2026 +0200

    Automatyczny retry przy przejściowych błędach sieciowych
    
    git_passthrough_worker ponawia komendę do 3 razy (konfigurowalny
    przez PLD.retries) z rosnącym opóźnieniem (5s, 10s, 20s) gdy git
    zgłosi znany błąd przejściowy (connection reset, timeout, DNS, itp.)
    zamiast od razu raportować porażkę.
    
    Komunikaty retry pokazują numer próby: retry [1/3] in 5s...,
    retry [1/3] succeeded / retry [3/3] failed, giving up.

 slug.py                   | 61 +++++++++++++++++++++++++++++++++++++++++------
 tests/conftest.py         |  1 +
 tests/test_passthrough.py |  2 ++
 3 files changed, 57 insertions(+), 7 deletions(-)
---
diff --git a/slug.py b/slug.py
index 29c1fcd..60419cf 100755
--- a/slug.py
+++ b/slug.py
@@ -302,6 +302,7 @@ def apply_defaults(options):
     CONFIG_KEYS = {
         'packagesdir': ('PLD.packagesdir', str),
         'jobs': ('PLD.jobs', int),
+        'retries': ('PLD.retries', int),
     }
 
     # Layer 2: fill from git config if CLI didn't set the value
@@ -320,6 +321,7 @@ def apply_defaults(options):
         # workers = more parallelism.  Capped at 32 to stay within typical
         # SSH MaxStartups limits on git servers.
         'jobs': lambda: min(cpu_count() * 4, 32),
+        'retries': lambda: 3,
         'quiet': lambda: False,
         'pattern': lambda: ['*'],
     }
@@ -454,24 +456,69 @@ def build_git_cmd(repo_dir, git_command, git_args, config_pairs):
     return cmd
 
 
-def git_passthrough_worker(repo_dir, git_command, git_args, config_pairs, quiet):
+_GIT_TRANSIENT_ERRORS = (
+    'Connection reset by peer',
+    'Connection timed out',
+    'Could not resolve hostname',
+    'Unable to look up',
+    'The remote end hung up unexpectedly',
+    'early EOF',
+    'SSL_read:',
+)
+
+# Initial delay in seconds before the first retry (doubles after each attempt)
+_GIT_RETRY_DELAY = 5
+
+
+def _is_transient_error(stderr_str):
+    """Check if stderr contains a known transient network error."""
+    return any(msg in stderr_str for msg in _GIT_TRANSIENT_ERRORS)
+
+
+def git_passthrough_worker(repo_dir, git_command, git_args, config_pairs, quiet, retries):
     """Run a git command in one repo, capture and prefix output.
 
     Called by the worker pool for each repo in parallel.
     Returns repo_dir on failure (so the caller can count failures), None on success.
 
+    Retries up to 'retries' times on transient network errors
+    (connection reset, timeout, etc.) with exponential backoff.
+
     Output policy:
       - Failed repos: always print everything (stdout + stderr) regardless of -q.
       - Successful repos, stderr: always print (may contain warnings).
       - Successful repos, stdout: print unless -q (quiet suppresses data output).
     """
+    import time
     directory = os.path.basename(repo_dir)
     cmd = build_git_cmd(repo_dir, git_command, git_args, config_pairs)
-    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                            preexec_fn=_reset_sigint)
-    stdout_bytes, stderr_bytes = proc.communicate()
-    stdout_str = stdout_bytes.decode('utf-8', errors='replace')
-    stderr_str = stderr_bytes.decode('utf-8', errors='replace')
+
+    delay = _GIT_RETRY_DELAY
+    for attempt in range(retries + 1):
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                preexec_fn=_reset_sigint)
+        stdout_bytes, stderr_bytes = proc.communicate()
+        stdout_str = stdout_bytes.decode('utf-8', errors='replace')
+        stderr_str = stderr_bytes.decode('utf-8', errors='replace')
+
+        if proc.returncode == 0 or not _is_transient_error(stderr_str):
+            if attempt > 0 and proc.returncode == 0:
+                print_prefixed('retry [{}/{}] succeeded'.format(
+                                   attempt, retries),
+                               directory, sys.stderr)
+            break
+
+        # Transient error — retry after backoff
+        if attempt < retries:
+            print_prefixed('transient error, retry [{}/{}] in {}s...'.format(
+                               attempt + 1, retries, delay),
+                           directory, sys.stderr)
+            time.sleep(delay)
+            delay *= 2
+        else:
+            print_prefixed('retry [{}/{}] failed, giving up'.format(
+                               attempt, retries),
+                           directory, sys.stderr)
 
     # Always show stderr (may contain warnings even on success).
     print_prefixed(stderr_str, directory, sys.stderr)
@@ -506,7 +553,7 @@ def passthrough_command(options, git_command, git_args):
               file=sys.stderr)
 
     # Build argument tuples for pool.starmap()
-    args = [(r, git_command, git_args, config_pairs, options.quiet)
+    args = [(r, git_command, git_args, config_pairs, options.quiet, options.retries)
             for r in repos]
     failed = run_worker(git_passthrough_worker, options, args)
 
diff --git a/tests/conftest.py b/tests/conftest.py
index c9545e3..d461f43 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,6 +10,7 @@ def make_options():
             "packagesdir": "/pkgs",
             "jobs": 2,
             "quiet": False,
+            "retries": 3,
             "pattern": ["*"],
             "had_errors": False,
         }
diff --git a/tests/test_passthrough.py b/tests/test_passthrough.py
index 658e8e3..597b8bd 100644
--- a/tests/test_passthrough.py
+++ b/tests/test_passthrough.py
@@ -48,6 +48,7 @@ def test_git_passthrough_worker_prints_all_output_for_failures(monkeypatch, caps
         [],
         ["pull.rebase=true"],
         quiet=True,
+        retries=0,
     )
 
     captured = capsys.readouterr()
@@ -69,6 +70,7 @@ def test_git_passthrough_worker_quiet_suppresses_success_stdout_only(monkeypatch
         [],
         [],
         quiet=True,
+        retries=0,
     )
 
     captured = capsys.readouterr()
================================================================

---- gitweb:

http://git.pld-linux.org/gitweb.cgi/projects/git-slug.git/commitdiff/4a7e426b8f1a3571094b5dc89412bc49b8f29666



More information about the pld-cvs-commit mailing list