Large Import Errors Script (split_csv_by_rows.py)

LostAndFound · August 29, 2025, 10:45pm

Hi All,

This might come in handy, if you are getting errors for a large import of passwords using kdbx format or any other formats.

Export all the passwords into csv format (csv keepass). Then use the following python 3 script to chunk them

split_csv_by_rows.py

#!/usr/bin/env python3
"""
split_csv_by_rows.py

Split a CSV into multiple files, each with a fixed number of rows (default 200).
Preserves header in each output file and never breaks a record across chunks.
"""

import argparse
import csv
import os
import sys
import gzip

def open_output(path, gzip_out=False):
    if gzip_out:
        return gzip.open(path, "wt", encoding="utf-8", newline='')
    return open(path, "w", encoding="utf-8", newline='')

def split_by_rows(input_path, outdir, prefix, rows_per_file, gzip_out=False, quoteall=False):
    quoting = csv.QUOTE_ALL if quoteall else csv.QUOTE_MINIMAL
    os.makedirs(outdir, exist_ok=True)

    with open(input_path, 'r', encoding='utf-8-sig', newline='') as infile:
        reader = csv.reader(infile)
        try:
            header = next(reader)
        except StopIteration:
            print("Empty file.")
            return

        file_index = 1
        row_count = 0
        outfile = None
        writer = None

        for row in reader:
            if row_count % rows_per_file == 0:
                # close previous file
                if outfile:
                    outfile.close()
                # open new file
                out_name = f"{prefix}_{file_index}.csv"
                if gzip_out:
                    out_name += ".gz"
                out_path = os.path.join(outdir, out_name)
                outfile = open_output(out_path, gzip_out)
                writer = csv.writer(outfile, quoting=quoting)
                writer.writerow(header)  # always write header
                print(f"Writing {out_path} ...")
                file_index += 1
            writer.writerow(row)
            row_count += 1

        if outfile:
            outfile.close()
    print("Done.")

def main():
    parser = argparse.ArgumentParser(description="Split CSV into chunks of fixed row count (default: 200 rows per file).")
    parser.add_argument("input", help="Input CSV file")
    parser.add_argument("--rows", "-r", type=int, default=200,
                        help="Number of data rows per output file (default: 200)")
    parser.add_argument("--outdir", "-o", default=".",
                        help="Output directory (default: current dir)")
    parser.add_argument("--prefix", "-p", default="part",
                        help="Output file prefix (default: part)")
    parser.add_argument("--gzip", action="store_true",
                        help="Compress output files with gzip (.gz)")
    parser.add_argument("--quoteall", action="store_true",
                        help="Force quoting all fields in output")
    args = parser.parse_args()

    if not os.path.isfile(args.input):
        print("ERROR: input file not found:", args.input, file=sys.stderr)
        sys.exit(1)

    split_by_rows(args.input, args.outdir, args.prefix, args.rows,
                  gzip_out=args.gzip, quoteall=args.quoteall)

if __name__ == "__main__":
    main()

Usage: (default rows 200)

python3 split_csv_by_rows.py passbolt-export-xxxxx.csv --outdir chunks --prefix upload

Custom rows:

python3 split_csv_by_rows.py passbolt-export-xxxxx.csv --rows 100 --outdir chunks --prefix upload

Available arguments:

--rows, -r              Number of data rows per output file (default: 200)

--outdir, -o            Output directory (default: current dir)

--prefix, -p            Output file prefix (default: part)

--gzip                  Compress output files with gzip (.gz)

--quoteall              Force quoting all fields in output

You’ll thank me later.

Enjoy

LostAndFound · August 29, 2025, 10:58pm

PHP Version: Converted by chat gpt, updated and tested by me. It works but i would use the python version.

#!/usr/bin/env php
<?php
/**
 * split_csv_by_rows.php
 *
 * Split a CSV file into multiple smaller files, each containing a fixed number of rows.
 * - Default: 200 rows per file
 * - Preserves header in each output file
 * - Handles embedded newlines inside quoted fields
 * - Optional gzip compression
 *
 * Usage:
 *   php split_csv_by_rows.php input.csv --rows=200 --outdir=chunks --prefix=part --gzip --quoteall
 */

function fputcsv_custom($handle, array $fields, string $delimiter = ",", string $enclosure = "\"", string $escape_char = "\\", bool $quoteAll = false): void {
    if ($quoteAll) {
        $escaped = [];
        foreach ($fields as $field) {
            $escaped[] = $enclosure . str_replace($enclosure, $enclosure . $enclosure, $field) . $enclosure;
        }
        fwrite($handle, implode($delimiter, $escaped) . "\n");
    } else {
        fputcsv($handle, $fields, $delimiter, $enclosure, $escape_char);
    }
}

function split_by_rows(string $inputPath, string $outdir, string $prefix, int $rowsPerFile, bool $gzipOut = false, bool $quoteAll = false): void {
    if (!file_exists($inputPath)) {
        fwrite(STDERR, "ERROR: input file not found: $inputPath\n");
        exit(1);
    }

    if (!is_dir($outdir)) {
        if (!mkdir($outdir, 0777, true) && !is_dir($outdir)) {
            fwrite(STDERR, "ERROR: cannot create output directory: $outdir\n");
            exit(1);
        }
    }

    $in = fopen($inputPath, 'r');
    if ($in === false) {
        fwrite(STDERR, "ERROR: cannot open input file: $inputPath\n");
        exit(1);
    }

    $header = fgetcsv($in, 0, ",", "\"", "\\");
    if ($header === false) {
        echo "Empty file.\n";
        fclose($in);
        return;
    }

    $fileIndex = 1;
    $rowCount = 0;
    $out = null;

    while (($row = fgetcsv($in, 0, ",", "\"", "\\")) !== false) {
        if ($rowCount % $rowsPerFile === 0) {
            if ($out) {
                fclose($out);
            }
            $outName = sprintf("%s_%d.csv", $prefix, $fileIndex);
            if ($gzipOut) {
                $outName .= ".gz";
            }
            $outPath = rtrim($outdir, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . $outName;
            $out = $gzipOut ? gzopen($outPath, 'w') : fopen($outPath, 'w');
            if ($out === false) {
                fwrite(STDERR, "ERROR: cannot create output file: $outPath\n");
                fclose($in);
                exit(1);
            }
            echo "Writing $outPath ...\n";
            if ($gzipOut) {
                gzwrite($out, implode(",", array_map(fn($f) => "\"".str_replace("\"", "\"\"", $f)."\"", $header)) . "\n");
            } else {
                fputcsv_custom($out, $header, ",", "\"", "\\", $quoteAll);
            }
            $fileIndex++;
        }
        if ($gzipOut) {
            gzwrite($out, implode(",", array_map(fn($f) => "\"".str_replace("\"", "\"\"", $f)."\"", $row)) . "\n");
        } else {
            fputcsv_custom($out, $row, ",", "\"", "\\", $quoteAll);
        }
        $rowCount++;
    }

    if ($out) {
        fclose($out);
    }
    fclose($in);

    echo "Done.\n";
}

/**
 * Parse CLI args (supports both --key=value and --key value).
 */
function parse_args(array $argv): array {
    $args = [
        'rows' => 200,
        'outdir' => '.',
        'prefix' => 'part',
        'gzip' => false,
        'quoteall' => false,
        'input' => null,
    ];

    foreach ($argv as $i => $arg) {
        if ($i === 0) continue; // script name

        if (!str_starts_with($arg, "--")) {
            // first non-option is input file
            if ($args['input'] === null) {
                $args['input'] = $arg;
            }
            continue;
        }

        // Handle --key=value
        if (strpos($arg, "=") !== false) {
            [$key, $val] = explode("=", substr($arg, 2), 2);
        } else {
            $key = substr($arg, 2);
            $val = true;
        }

        switch ($key) {
            case 'rows':     $args['rows'] = (int)$val; break;
            case 'outdir':   $args['outdir'] = (string)$val; break;
            case 'prefix':   $args['prefix'] = (string)$val; break;
            case 'gzip':     $args['gzip'] = true; break;
            case 'quoteall': $args['quoteall'] = true; break;
        }
    }

    return $args;
}

$args = parse_args($argv);

if ($args['input'] === null) {
    fwrite(STDERR, "Usage: php split_csv_by_rows.php input.csv [--rows=200] [--outdir=chunks] [--prefix=part] [--gzip] [--quoteall]\n");
    exit(1);
}

split_by_rows($args['input'], $args['outdir'], $args['prefix'], $args['rows'], $args['gzip'], $args['quoteall']);