File Handling II¶

More snippets on handling files using Free Pascal.

Listing specific columns in a CSV file - TCSVDataSet¶

Here is an example of reading the first two columns of a CSV file using TCSVDataset.

program TCSVDatasetGetSpecificCols;

{
 An example of listing the content of first two columns in a CSV file
 using TCSVDataset.
}

{$mode objfpc}{$H+}{$J-}

uses
  {$IFDEF UNIX}
  cmem, cthreads,
  {$ENDIF}
  Classes,
  SysUtils,
  streamex,
  bufstream,
  csvdataset;

  // A routine to list first two columns of a CSV file
  {
    Lists the first two columns of every record of a CSV file on standard
    output, one line per record, formatted as 'row <n>: <col1>, <col2>'.

    filename            - path of the CSV file to read.
    delimiter           - field separator character (defaults to a comma).
    isFirstRowFieldName - when True the first line is used as field names,
                          so it is not listed as a data row.

    All three objects are released in `finally` sections, so the file handle
    is closed even when loading or listing raises an exception.
  }
  procedure ReadCSV(filename: string;
                    delimiter: char = ',';
                    isFirstRowFieldName: boolean = False);
  var
    fileStream: TFileStream;
    buffStream: TReadBufStream;
    csvDataset: TCSVDataset;
    lineNo: int64;
  begin
    fileStream := TFileStream.Create(filename, fmOpenRead);
    try
      // Wrap the file in a 64 KB read buffer
      buffStream := TReadBufStream.Create(fileStream, 65536);
      try
        csvDataset := TCSVDataset.Create(nil);
        try

          // Assign a valid delimiter
          csvDataset.CSVOptions.Delimiter := delimiter;

          // Is the first line field names?
          // If yes, first row will be excluded when listing rows
          csvDataset.CSVOptions.FirstLineAsFieldNames := isFirstRowFieldName;

          // Load CSV from the stream
          csvDataset.LoadFromCSVStream(buffStream);

          // Move to first record
          csvDataset.First;

          // Row numbers shown to the user start at 1
          lineNo := 1;

          while not csvDataset.EOF do
          begin
            // Get the values of the first two fields here and list them.
            WriteLn(Format('row %d: %s, %s',
                           [lineNo,
                            csvDataset.Fields[0].AsString,
                            csvDataset.Fields[1].AsString]));

            // Move to next
            csvDataset.Next;

            // Increment line no
            lineNo := lineNo + 1;
          end;

        finally
          csvDataset.Free;
        end;
      finally
        buffStream.Free;
      end;
    finally
      // Must be inside `finally`: otherwise an exception raised above would
      // skip this call and leave the file handle open.
      fileStream.Free;
    end;
  end;

var
  csvPath: string;

begin
  // The CSV file to list is given as the first command-line argument.
  csvPath := ParamStr(1);

  if FileExists(csvPath) then
    // Semicolon-delimited input; the first row is treated as data.
    ReadCSV(csvPath, ';', False)
  else
    WriteLn('Cannot find file.');
end.

Listing specific columns in a CSV file - TCSVDocument¶

Here is an example of reading the first two columns of a CSV file using TCSVDocument.

program TCSVDocumentGetSpecificCols;


{
 An example of listing the content of first two columns in a CSV file
 using TCSVDocument.
}

{$mode objfpc}{$H+}

uses
  {$IFDEF UNIX}
  cmem, cthreads,
  {$ENDIF}
  Classes,
  SysUtils,
  csvdocument,
  streamex,
  bufstream;

  {
    Lists the first two columns of every row of a CSV file on standard
    output, one line per row, formatted as 'row <n>: <col1>, <col2>'.

    filename  - path of the CSV file to read.
    delimiter - field separator character.

    All three objects are released in `finally` sections, so the file handle
    is closed even when loading or listing raises an exception.
  }
  procedure ReadCSV(filename: string; delimiter: char);
  var
    fileStream: TFileStream;
    buffStream: TReadBufStream;
    csvReader: TCSVDocument;
    index, totalLines: int64;
  begin
    totalLines := 0;
    fileStream := TFileStream.Create(filename, fmOpenRead);
    try
      // Wrap the file in a 64 KB read buffer
      buffStream := TReadBufStream.Create(fileStream, 65536);
      try
        csvReader := TCSVDocument.Create;
        try
          // Assign a delimiter
          csvReader.Delimiter := delimiter;

          // Load the whole document from the buffered stream.
          // LoadFromStream takes only the source stream; buffering is
          // already provided by buffStream.
          csvReader.LoadFromStream(buffStream);

          // Get total lines for iteration.
          totalLines := csvReader.RowCount;

          // Print the values of first two columns from the CSV file.
          // Cells is indexed as [column, row]; rows are zero-based, so the
          // number shown to the user is index + 1.
          for index := 0 to totalLines-1 do
          begin
            WriteLn(Format('row %d: %s, %s', [(index + 1),
                                              csvReader.Cells[0, index],
                                              csvReader.Cells[1, index]]));
          end;

        finally
          csvReader.Free;
        end;
      finally
        buffStream.Free;
      end;
    finally
      // Must be inside `finally`: otherwise an exception raised above would
      // skip this call and leave the file handle open.
      fileStream.Free;
    end;
  end;

var
  inputPath: string;

begin
  // First command-line argument: the CSV file to list.
  inputPath := ParamStr(1);

  if FileExists(inputPath) then
    // The input is expected to be semicolon-delimited.
    ReadCSV(inputPath, ';')
  else
    WriteLn('Cannot find file.');
end.

Split text file into chunks of 1 Mbytes - TFileStream¶

This snippet splits a large text file into smaller chunks without breaking lines or paragraphs. Here's a summary of the program's functionality:

Procedure SaveChunkToFile: Handles creating new files for each chunk and writing the chunk data to these files.
Constants and Variables:
- defaultChunkSize: The size of each chunk, set to 1 MB.
- fileStream: Used to read from the input file.
- buffer: A memory buffer to hold chunk data.
- bytesRead, totalBytesRead, chunkSize, lineBreakPos, chunkIndex: Variables to track file reading and processing.
Main Logic:
- The program checks if the input file exists.
- It reads the file in chunks, finds the last newline in each chunk, adjusts the file pointer if necessary, and writes each chunk to a new file.
- The loop continues until the entire text file is processed.

program TFileStreamSplitFile;

{
 This program splits a text file based on a chunkSize.
 The algorithm ensures it won't split the text in the middle of a line/paragraph.

 1. Open the file and allocate memory buffers for reading chunks of data.

 2. Read the file in chunks and locate the last `\n` character in the chunk.
    Once it locates the last `\n` in the chunk, move the file pointer back to include
    that byte and any preceding bytes of the partial line in the next chunk's read operation.

 3. Repeat - read and parse the remainder.

 4. Once parsing is complete, close the file and free any allocated memory (to prevent memory leaks).
}

{$mode objfpc}{$H+}{$J-}

uses
  {$IFDEF UNIX}
  cmem, cthreads,
  {$ENDIF}
  SysUtils,
  Classes,
  bufstream;

  {
    Writes one chunk of data to its own file.

    filename   - base name of the output file.
    chunkData  - pointer to the first byte of the chunk.
    dataSize   - number of bytes to write from chunkData.
    chunkIndex - chunk number, embedded in the output file name.

    The output file is named '<filename>-chunk-<chunkIndex>.txt' and is
    created with fmCreate.
  }
  procedure SaveChunkToFile(const filename: string; const chunkData: pansichar;
  const dataSize: integer; const chunkIndex: integer);
  var
    targetName: string;
    outStream: TFileStream;
  begin
    // Build the name of the file that receives this chunk
    targetName := filename + '-chunk-' + IntToStr(chunkIndex) + '.txt';

    outStream := TFileStream.Create(targetName, fmCreate);
    try
      // Copy exactly dataSize bytes from the buffer into the file
      outStream.WriteBuffer(chunkData^, dataSize);
    finally
      FreeAndNil(outStream);
    end;
  end;

const
  defaultChunkSize: integer = 1048576; // 1 MB in bytes

var
  fileStream: TFileStream;
  buffer: pansichar;
  bytesRead, totalBytesRead, chunkSize, lineBreakPos, chunkIndex: int64;

{
  Splits the text file named by the first command-line argument into files
  'output-chunk-<n>.txt' of at most chunkSize bytes each, cutting after a
  newline (#10) whenever the chunk contains one.

  totalBytesRead counts only the bytes actually written to chunk files, so it
  always equals the stream position at the start of the next read.
}
begin

  if not FileExists(ParamStr(1)) then
  begin
    WriteLn('Please specify a valid text file.');
    Exit;
  end;

  chunkSize := defaultChunkSize * 1;

  // Open the file for reading
  fileStream := TFileStream.Create(ParamStr(1), fmOpenRead);
  try
    // Allocate memory buffer for reading chunks
    // Ref: https://www.freepascal.org/docs-html/rtl/system/getmem.html
    GetMem(buffer, chunkSize);
    try
      totalBytesRead := 0;
      chunkIndex := 0;

      // Read and parse chunks of data until every byte has been written out
      while totalBytesRead < fileStream.Size do
      begin
        bytesRead := fileStream.Read(buffer^, chunkSize);

        // Nothing could be read: stop instead of looping without progress
        if bytesRead <= 0 then
          Break;

        // By default the whole buffer is written. This is what happens for
        // the final chunk, so a last line without a trailing newline is kept.
        lineBreakPos := bytesRead;

        if totalBytesRead + bytesRead < fileStream.Size then
        begin
          // More data follows: cut after the last newline in the chunk
          while (lineBreakPos > 0) and (buffer[lineBreakPos - 1] <> #10) do
            Dec(lineBreakPos);

          // No newline at all (a line longer than the chunk): the line has to
          // be split, otherwise an empty chunk would be written and the same
          // bytes would be read again.
          if lineBreakPos = 0 then
            lineBreakPos := bytesRead;
        end;

        { Move the file pointer back so that the partial line after the last
          newline is read again as the start of the next chunk.
          Ref: https://www.freepascal.org/docs-html/rtl/classes/tstream.seek.html }
        if lineBreakPos < bytesRead then
          fileStream.Seek(-(bytesRead - lineBreakPos), soCurrent);

        // Count only the bytes that end up in this chunk; the bytes that were
        // seeked back over are counted when they are read again.
        Inc(totalBytesRead, lineBreakPos);

        // Write the chunk data to a file using the separate procedure
        SaveChunkToFile('output', buffer, lineBreakPos, chunkIndex);

        // Display user feedback
        WriteLn('Chunk ', chunkIndex, ', Total bytes read:', IntToStr(totalBytesRead));

        // Increase chunk index - a counter
        Inc(chunkIndex);
      end;
    finally
      // Free the memory buffer
      FreeMem(buffer);
    end;
  finally
    // Close the file
    fileStream.Free;
  end;
end.