cassandra-consistency-script/cassandra_consistency_script.php

1310 lines
48 KiB
PHP
Raw Normal View History

2023-05-19 11:58:21 +02:00
<?php
2023-05-29 08:23:39 +02:00
/**
* Class DataConsistencyChecker
*
* Command-line tool that compares the attachment entries stored in Cassandra with the
* physical attachment files on disk and writes CSV and HTML reports of the differences.
*/
2023-05-19 11:58:21 +02:00
class DataConsistencyChecker
{
private $_cluster;
private $session;
private $cassandra;
private $directory;
private $structured_directory;
private $retrived_csv;
2023-05-29 08:23:39 +02:00
private static $clientId;
2023-05-19 11:58:21 +02:00
private static $schemaVersion = 1;
2023-05-29 08:23:39 +02:00
private static $bucketMagic = 4;
private const DEFAULT_PAGE_SIZE = 30;
private const CASSANDRA_RESULT_CSV = 'result_from_cassandra_entries.csv';
private const CASSANDRA_HTML_REPORT = 'cassandra.html';
private const PHYSICAL_RESULT_CSV = 'result_from_physical_files.csv';
private const PHYSICAL_HTML_REPORT = 'physical.html';
private const CSV_COLUMN_FILE_ATTACHMENT = 'File/Attachment';
private const CSV_COLUMN_FILE_PATH = 'File path';
private const CSV_COLUMN_FILE_NAME = 'File name';
private const CSV_COLUMN_THUMB1 = 'Thumb 1';
private const CSV_COLUMN_THUMB2 = 'Thumb 2';
private const CSV_COLUMN_SIZE = 'Size';
private const CSV_COLUMN_CREATION_TIME = 'Creation Time';
private const CSV_COLUMN_CLIENT_ID = 'ClientId';
private const CSV_COLUMN_BUCKET = 'Bucket';
private const CSV_COLUMN_ID = 'Id';
private const CSV_EXTENSION = '.csv';
private const NUM_FIELDS = 3;
private const DELIMITER = '-';
2023-05-19 11:58:21 +02:00
/**
 * Creates the checker and immediately runs it against the command-line arguments.
 *
 * runFromCommandLine() always terminates the process with exit, so code placed
 * after `new DataConsistencyChecker()` is never reached.
 *
 * @param string|null $directory Optional attachment directory; the value parsed from
 *                               the command line replaces it once the options are read.
 */
public function __construct($directory = null)
{
    if ($directory !== null) {
        $this->directory = $directory;
    }
    // $_SERVER['argv'] is absent outside the CLI (or with register_argc_argv off);
    // fall back to an empty list so the array-typed parameter does not raise a TypeError.
    $this->runFromCommandLine($_SERVER['argv'] ?? []);
}
/**
 * Opens the Cassandra session described by the [CASSANDRA] section of config.ini.
 *
 * Reads host, port, user, password and keyspace from that section, builds the
 * cluster object and connects to the keyspace. The session is stored in
 * $this->cassandra and the cluster in $this->_cluster.
 *
 * @return void
 * @throws RuntimeException If config.ini cannot be parsed, a required key is missing,
 *                          or the connection to the keyspace fails.
 */
public function init(): void
{
    $config = parse_ini_file('config.ini', true);
    if ($config === false || !isset($config['CASSANDRA']) || !is_array($config['CASSANDRA'])) {
        throw new RuntimeException('Unable to read the [CASSANDRA] section from config.ini.');
    }

    $cassandraConfig = $config['CASSANDRA'];
    foreach (['host', 'port', 'user', 'password', 'keyspace'] as $requiredKey) {
        if (!isset($cassandraConfig[$requiredKey])) {
            throw new RuntimeException('Missing "' . $requiredKey . '" in the [CASSANDRA] section of config.ini.');
        }
    }

    $this->_cluster = Cassandra::cluster()
        ->withContactPoints($cassandraConfig['host'])
        ->withPort((int) $cassandraConfig['port'])
        ->withCredentials(
            $cassandraConfig['user'],
            $cassandraConfig['password']
        )
        ->build();

    try {
        $this->cassandra = $this->_cluster->connect($cassandraConfig['keyspace']);
    } catch (Exception $e) {
        // Keep the original console message, but do not continue with a null session:
        // every later query would otherwise fail with "call to a member function on null".
        echo "An error occurred: " . $e->getMessage() . "\n";
        throw new RuntimeException('Could not connect to Cassandra.', 0, $e);
    }
}
/**
 * Parses the command-line options, validates them and runs the requested mode.
 *
 * Two modes exist:
 *  - removal mode, when both --remove and --source are given: delegates to
 *    processAttachmentDeletionCSV();
 *  - check mode otherwise: requires --directory, --version and --output, runs
 *    checkConsistency() and then deletes the --output folder.
 *
 * The method always terminates the process with exit.
 *
 * @param array $arguments The command line arguments. Not read directly: getopt()
 *                         parses the process arguments itself.
 * @return void
 */
public function runFromCommandLine(array $arguments): void
{
    $shortOptions = "hd:v:o:r:s:";
    // NOTE(review): "v", "o", "r", "s" are kept as value-less long options exactly as before.
    $longOptions = ["help", "directory:", "version:", "v", "output:", "o", "remove:", "r", "source:", "s"];
    $options = getopt($shortOptions, $longOptions);
    if ($options === false || count($options) === 0 || isset($options['h']) || isset($options['help'])) {
        $this->displayHelpMessage();
        exit;
    }

    $directory = $options['directory'] ?? $options['d'] ?? null;
    $schemaVersion = $options['version'] ?? $options['v'] ?? null;
    $source = $options['source'] ?? $options['s'] ?? null;
    $remove = $options['remove'] ?? $options['r'] ?? null;
    $structured_directory = $options['output'] ?? $options['o'] ?? null;

    if ($structured_directory === null) {
        // The output folder is only optional when a --source is given.
        if ($source === null) {
            echo "Output directory is required. Please specify the --output option.\n";
            exit;
        }
    } elseif (!file_exists($structured_directory)) {
        if (!mkdir($structured_directory, 0777, true) && !is_dir($structured_directory)) {
            echo "Unable to create the output directory.\n";
            exit;
        }
    }
    $this->structured_directory = $structured_directory;

    if (($directory === null || $schemaVersion === null) && $remove === null && $source === null) {
        echo "Missing Attachment directory or schema version.\n";
        exit;
    }

    // Strict comparison on the string form: the previous loose check let "0" through
    // (falsy, so never validated) and accepted values such as "1.0".
    if ($schemaVersion !== null && !in_array((string) $schemaVersion, ['1', '2'], true)) {
        echo "Invalid schema version. Only versions 1 and 2 are supported.\n";
        exit;
    }

    // Removal needs both options; with only one of them the script used to fall
    // through to a full consistency check with incomplete arguments.
    if (($remove === null) !== ($source === null)) {
        echo "Both --remove and --source are required to remove entries.\n";
        exit;
    }

    // Only override the default schema version when one was supplied; casting a
    // missing value would silently set it to 0.
    if ($schemaVersion !== null) {
        static::$schemaVersion = (int) $schemaVersion;
    }
    $this->directory = $directory;
    $this->retrived_csv = './result_from_physical_files.csv';

    if ($remove && $source) {
        $this->processAttachmentDeletionCSV($remove, $source);
    } else {
        $this->checkConsistency('attachment_file_info');

        // The output folder only holds intermediate CSV files; remove it once the reports exist.
        if (is_dir($this->structured_directory)) {
            $this->removeDirectory($this->structured_directory);
        }
    }
    exit;
}
2023-05-19 11:58:21 +02:00
/**
 * Prints the usage instructions for the script to standard output.
 *
 * The text is assembled from a list of lines and always ends with a newline.
 *
 * @return void
 */
private function displayHelpMessage(): void
{
    $lines = [
        'Usage:',
        '  php script.php [options]',
        '',
        'Options:',
        '  -h, --help        Display this help screen.',
        '  -v, --version     Set the schema version (default: v1).',
        '  -d, --directory   Set the directory path for attachments.',
        '  -o, --output      Set the folder for temp files - note that this folder will be deleted after script completes',
        '  -r, --remove      Set the CSV report whose entries should be removed',
        '  -s, --source      We need to choose between file and cassandra what we want to remove',
        '',
        'Example:',
        '  php script_name --version schema_version --directory=/path/to/directory --output ./out/',
        '',
        'For Delete:',
        '  php script_name --remove result_from_cassandra_entries.csv --source file - to remove missing physical files',
        '  php script_name --remove result_from_physical_files.csv --source cassandra --version 2 - to remove missing cassandra entries',
    ];

    echo implode("\n", $lines) . "\n";
}
/**
 * Recursively removes a directory and everything inside it.
 *
 * Does nothing when the path is not a directory or cannot be listed. Symbolic
 * links found inside the directory are unlinked, never followed, so files that
 * live outside the tree are left untouched.
 *
 * @param string $directory The directory path to be removed.
 * @return void
 */
private function removeDirectory(string $directory): void
{
    if (!is_dir($directory)) {
        return;
    }

    $names = scandir($directory);
    if ($names === false) {
        // Unreadable directory: nothing can be listed, so nothing can be removed.
        return;
    }

    foreach (array_diff($names, ['.', '..']) as $name) {
        $path = $directory . DIRECTORY_SEPARATOR . $name;
        // is_dir() is true for a symlink pointing at a directory; treat links as plain entries.
        if (is_dir($path) && !is_link($path)) {
            $this->removeDirectory($path);
        } else {
            unlink($path);
        }
    }

    rmdir($directory);
}
/**
 * Runs the full comparison between the Cassandra table and the attachment directory.
 *
 * Connects to Cassandra, dumps the table rows and the physical files into
 * intermediate CSV files, then compares those CSV files and writes the reports.
 *
 * @param string $tableName The Cassandra table whose entries are checked.
 * @return void
 */
public function checkConsistency(string $tableName): void
{
    $this->init();

    // Both collectors are called for their side effect: they write the per-bucket
    // CSV files that process_files_in_directory() reads back.
    $this->getDbEntries($tableName);
    $this->getFileEntries($this->directory);

    $this->process_files_in_directory($this->structured_directory);
}
/**
 * Collects the attachment files of a directory and writes them to per-bucket CSV files.
 *
 * Thumbnail files (names containing "-thumb1" or "-thumb2") are skipped. A file name
 * is split into a client id and an id at the first "-"; when it has no "-", it is
 * split at the first "." instead. Files that contain neither are ignored. Entries are
 * grouped by the first two characters of the id part, and one CSV file is written
 * per group through createPhysicalFileCSV().
 *
 * @param string $directory The directory path to retrieve file entries from.
 * @return array File entries grouped by the two-character prefix of their id.
 */
private function getFileEntries(string $directory): array
{
    $files = glob($directory . '/*');
    if ($files === false) {
        // glob() reports an error with false; treat it as "no files".
        $files = [];
    }

    $entries = [];
    foreach ($files as $file) {
        if (!is_file($file)) {
            continue;
        }

        $fileName = basename($file);
        if (strpos($fileName, '-thumb1') !== false || strpos($fileName, '-thumb2') !== false) {
            continue;
        }

        $parts = explode(self::DELIMITER, $fileName, 2);
        if (count($parts) !== 2) {
            // No dash: fall back to "<client>.<rest>".
            $parts = explode('.', $fileName, 2);
            if (count($parts) !== 2) {
                continue;
            }
        }
        [$clientId, $id] = $parts;

        $group = (string) substr($id, 0, 2);
        $entries[$group][] = [
            'client_id' => $clientId,
            'file_path' => $file,
            'file_name' => $fileName,
            'size' => filesize($file),
            // The modification time is what gets reported as the creation time.
            'creation_time' => date('Y-m-d H:i:s', filemtime($file)),
        ];
    }

    foreach ($entries as $group => $groupEntries) {
        // Numeric-looking prefixes become integer array keys; cast back for the string parameter.
        $this->createPhysicalFileCSV((string) $group, $groupEntries);
    }

    return $entries;
}
/**
 * Writes one group of physical file entries to "physical_<group>.csv" in the output folder.
 *
 * Each row holds id, size, creation_time and filename, where the id is the file
 * name up to (not including) its first dot.
 *
 * @param string $clientId The group identifier used in the CSV file name.
 * @param array  $entries  Entries with 'file_name', 'size' and 'creation_time' keys.
 * @return void
 * @throws RuntimeException If the CSV file cannot be opened for writing.
 */
private function createPhysicalFileCSV(string $clientId, array $entries): void
{
    // The output folder may be given with or without a trailing separator; without
    // this the file would be created next to the folder instead of inside it.
    $directory = (string) $this->structured_directory;
    $lastChar = substr($directory, -1);
    if ($directory !== '' && $lastChar !== '/' && $lastChar !== DIRECTORY_SEPARATOR) {
        $directory .= DIRECTORY_SEPARATOR;
    }

    $fileName = $directory . 'physical_' . $clientId . self::CSV_EXTENSION;
    $csvFile = fopen($fileName, 'w');
    if ($csvFile === false) {
        throw new RuntimeException('Failed to open ' . $fileName . ' for writing.');
    }

    fputcsv($csvFile, ['id', 'size', 'creation_time', 'filename']);
    foreach ($entries as $entry) {
        $dotPosition = strpos($entry['file_name'], '.');
        $id = $dotPosition === false
            ? $entry['file_name']
            : substr($entry['file_name'], 0, $dotPosition);

        fputcsv($csvFile, [
            $id,
            $entry['size'],
            $entry['creation_time'],
            $entry['file_name'],
        ]);
    }
    fclose($csvFile);
}
2023-05-19 11:58:21 +02:00
/**
 * Writes one group of Cassandra entries to "cassandra_<group>.csv" in the output folder.
 *
 * Columns: id, size, creation_time, filename, bucket, client_id, attachment_id.
 *
 * @param string $clientId The group identifier used in the CSV file name.
 * @param array  $entries  Entries holding one value per column listed above.
 * @return void
 * @throws RuntimeException If the CSV file cannot be opened for writing.
 */
private function createDBFileCSV(string $clientId, array $entries): void
{
    // Accept the output folder with or without a trailing separator so the file
    // always lands inside it.
    $directory = (string) $this->structured_directory;
    $lastChar = substr($directory, -1);
    if ($directory !== '' && $lastChar !== '/' && $lastChar !== DIRECTORY_SEPARATOR) {
        $directory .= DIRECTORY_SEPARATOR;
    }

    $fileName = $directory . 'cassandra_' . $clientId . self::CSV_EXTENSION;
    $csvFile = fopen($fileName, 'w');
    if ($csvFile === false) {
        throw new RuntimeException('Failed to open ' . $fileName . ' for writing.');
    }

    $headers = ['id', 'size', 'creation_time', 'filename', 'bucket', 'client_id', 'attachment_id'];
    fputcsv($csvFile, $headers);
    foreach ($entries as $entry) {
        fputcsv($csvFile, [
            $entry['id'],
            $entry['size'],
            $entry['creation_time'],
            $entry['filename'],
            $entry['bucket'],
            $entry['client_id'],
            $entry['attachment_id'],
        ]);
    }
    fclose($csvFile);
}
/**
 * Reads every row of a Cassandra table and writes them to per-group CSV files.
 *
 * Rows whose bucket is 'x' are skipped. Rows with a purely numeric id are grouped
 * by the first two characters that follow the first dot of their filename and are
 * keyed by filename; all other rows are grouped by the first two characters of
 * their id and keyed by "<client_id>-<id>". One CSV file per group is written
 * through createDBFileCSV().
 *
 * @param string $tableName The name of the database table.
 * @return array The retrieved entries, grouped as described above.
 */
private function getDbEntries(string $tableName): array
{
    // The bucket column is only selected for schema versions other than 1.
    $query = sprintf(
        'SELECT client_id, id, size, filename, created_on%s FROM %s',
        $this->schema_version() === 1 ? '' : ', bucket',
        $tableName
    );
    $result = $this->cassandra->execute(
        $query,
        [
            'arguments' => [],
            'page_size' => static::DEFAULT_PAGE_SIZE,
        ]
    );

    $entries = [];
    while ($result) {
        foreach ($result as $row) {
            // Schema 1 rows carry no bucket column at all.
            $bucket = $row['bucket'] ?? null;
            if ($bucket === 'x') {
                continue;
            }

            $id = (string) $row['id'];

            // NOTE(review): if created_on is a timestamp column the driver returns a
            // Cassandra\Timestamp object, which a plain (int) cast cannot convert.
            // Plain numeric values keep the previous behaviour - confirm the column type.
            $createdOn = $row['created_on'];
            $timestamp = $createdOn instanceof \Cassandra\Timestamp
                ? $createdOn->time()
                : (int) $createdOn;

            if (preg_match('/^[0-9]+$/', $id)) {
                $dotParts = explode('.', (string) $row['filename'], 2);
                // A filename without a dot has no second part; group it under ''.
                $group = (string) substr($dotParts[1] ?? '', 0, 2);
                $key = $row['filename'];
                $entryId = $row['id'];
            } else {
                $group = (string) substr($id, 0, 2);
                $key = $row['client_id'] . self::DELIMITER . $row['id'];
                $entryId = $key;
            }

            $entries[$group][$key] = [
                'id' => $entryId,
                'size' => (string) $row['size'],
                'creation_time' => date('Y-m-d H:i:s', $timestamp),
                'filename' => $row['filename'],
                'bucket' => $bucket ? $bucket : '',
                'client_id' => (string) $row['client_id'],
                'attachment_id' => (string) $row['id'],
            ];
        }
        $result = $result->nextPage();
    }

    foreach ($entries as $group => $groupEntries) {
        // Numeric-looking prefixes become integer array keys; cast back for the string parameter.
        $this->createDBFileCSV((string) $group, $groupEntries);
    }

    return $entries;
}
/**
 * Checks one physical entry against the Cassandra association table.
 *
 * The first two values of each side are compared after surrounding double quotes
 * are stripped.
 *
 * @param string $id The ID of the file entry.
 * @param array $data The physical entry values; indexes 0 and 1 are used.
 * @param array $cassandra_file_assoc Cassandra values keyed by id; indexes 0 and 1 are used.
 * @return array|null Null when both sides agree. Otherwise an array with 'id' and
 *                    'file1' (physical values), plus 'file2' (Cassandra values)
 *                    when the id exists on the Cassandra side.
 */
private function compareFileEntries(string $id, array $data, array $cassandra_file_assoc)
{
    $physical = [trim($data[0], '"'), trim($data[1], '"')];

    // Unknown on the Cassandra side: report the physical values only.
    if (!isset($cassandra_file_assoc[$id])) {
        return [
            'id' => $id,
            'file1' => [$id, $physical[0], $physical[1]],
        ];
    }

    $stored = $cassandra_file_assoc[$id];
    $cassandra = [trim($stored[0], '"'), trim($stored[1], '"')];

    if ($physical === $cassandra) {
        return null;
    }

    return [
        'id' => $id,
        'file1' => [$id, $physical[0], $physical[1]],
        'file2' => [$id, $cassandra[0], $cassandra[1]],
    ];
}
/**
 * Reads a whole file and splits its content on "\n".
 *
 * A file ending with a newline therefore yields a trailing empty string.
 *
 * @param string $file The path to the file.
 * @return array The lines of the file, without the "\n" separators.
 */
private function getFileLines(string $file): array
{
    return explode("\n", file_get_contents($file));
}
/**
 * Drops falsy entries and returns the element at index 0 of each remaining entry.
 *
 * @param array $entries Entries to reduce; each kept entry is read at index 0.
 * @return array A re-indexed list with one element per kept entry.
 */
private function filterAndMapEntries(array $entries): array
{
    $firstElements = [];
    foreach ($entries as $entry) {
        // Same rule as array_filter() without a callback: skip anything falsy.
        if (!$entry) {
            continue;
        }
        $firstElements[] = $entry[0];
    }

    return $firstElements;
}
/**
 * Compares the intermediate CSV files of a folder and writes the CSV and HTML reports.
 *
 * The folder is expected to hold "physical_<group>.csv" and "cassandra_<group>.csv"
 * files. For a group present on both sides the two files are compared in both
 * directions with compare_csv_files(). For a group present on one side only, every
 * row of that file is reported as missing on the other side.
 *
 * Physical files without a matching Cassandra entry go to the "cassandra" reports;
 * Cassandra entries without a matching physical file go to the "physical" reports.
 *
 * @param string $dir The directory path containing the files to be processed.
 * @return void
 */
private function process_files_in_directory(string $dir): void
{
    $physical_files = [];
    $cassandra_files = [];
    foreach (glob($dir . '/*.csv') ?: [] as $file) {
        $filename = basename($file, static::CSV_EXTENSION);
        $separator = strpos($filename, '_');
        if ($separator === false) {
            continue;
        }
        $csv_type = substr($filename, 0, $separator);
        $file_num = substr($filename, $separator + 1);
        if ($csv_type === 'physical') {
            $physical_files[$file_num] = $file;
        } elseif ($csv_type === 'cassandra') {
            $cassandra_files[$file_num] = $file;
        }
    }
    ksort($physical_files);
    ksort($cassandra_files);

    // Reads the data rows (header dropped) of one intermediate CSV file with a real
    // CSV parser, so quoted values and embedded commas are handled.
    $readRows = static function (string $path): array {
        $handle = fopen($path, 'r');
        if ($handle === false) {
            throw new RuntimeException('Failed to read ' . $path . '.');
        }
        $rows = [];
        fgetcsv($handle); // header line
        while (($row = fgetcsv($handle)) !== false) {
            if (isset($row[0])) { // fgetcsv() yields [null] for a blank line
                $rows[] = $row;
            }
        }
        fclose($handle);
        return $rows;
    };

    $cassandra_entries = []; // physical files whose Cassandra entry is missing or differs
    $physical_entries = [];  // Cassandra entries whose physical file is missing or differs

    foreach ($physical_files as $file_num => $physical_file) {
        if (isset($cassandra_files[$file_num])) {
            $cassandra_file = $cassandra_files[$file_num];
            // Keep every difference of the group, not only the first one.
            foreach ($this->compare_csv_files($physical_file, $cassandra_file) as $difference) {
                $cassandra_entries[] = $difference;
            }
            foreach ($this->compare_csv_files($cassandra_file, $physical_file) as $difference) {
                $physical_entries[] = $difference;
            }
            continue;
        }

        // No Cassandra file for this group: every physical file lacks an entry.
        foreach ($readRows($physical_file) as $row) {
            $cassandra_entries[] = [
                'id' => $row[0],
                // id, size, creation_time, filename
                'file1' => [$row[0], $row[1] ?? null, $row[2] ?? null, $row[3] ?? null],
            ];
        }
    }

    foreach ($cassandra_files as $file_num => $cassandra_file) {
        if (isset($physical_files[$file_num])) {
            continue;
        }

        // No physical file for this group: every Cassandra entry lacks a file.
        foreach ($readRows($cassandra_file) as $row) {
            $physical_entries[] = [
                'id' => $row[0],
                // id, size, creation_time, filename, bucket, client_id, attachment_id
                'file1' => array_pad(array_slice($row, 0, 7), 7, null),
            ];
        }
    }

    $result_from_cassandra_entries = array_unique($cassandra_entries, SORT_REGULAR);
    $result_from_physical_files = array_unique($physical_entries, SORT_REGULAR);

    $this->generateCsvReportForDbEntries($result_from_cassandra_entries, self::CASSANDRA_RESULT_CSV);
    $this->generateHtmlReport($result_from_cassandra_entries, self::CASSANDRA_HTML_REPORT);
    $this->generateCsvReportForPhysicalFiles($result_from_physical_files, self::PHYSICAL_RESULT_CSV);
    $this->generateHtmlReport($result_from_physical_files, self::PHYSICAL_HTML_REPORT);
}
2023-05-29 08:23:39 +02:00
/**
 * Builds an id-keyed lookup table from the lines of an intermediate CSV file.
 *
 * The first line (and any line identical to it) is treated as the header and
 * skipped, as are blank lines. Lines are parsed as CSV, so quoted values come back
 * without their quotes. A line with exactly three fields maps its id to
 * [field 1, field 2]; a line with more fields maps its id to fields 1 to 6, padded
 * with null when the line is shorter than seven fields.
 *
 * @param array $file_lines An array containing lines of a file.
 * @return array An associative array keyed by the first field of each line.
 */
private function buildFileAssociation(array $file_lines): array
{
    $file_assoc = [];
    $header = reset($file_lines);

    foreach ($file_lines as $line) {
        if ($line === $header || trim((string) $line) === '') {
            continue;
        }

        $values = str_getcsv($line);
        $fieldCount = count($values);

        if ($fieldCount === self::NUM_FIELDS) {
            $file_assoc[$values[0]] = [$values[1], $values[2]];
        } elseif ($fieldCount > self::NUM_FIELDS) {
            $file_assoc[$values[0]] = array_pad(array_slice($values, 1, 6), 6, null);
        }
    }

    return $file_assoc;
}
/**
 * Lists the rows of the first CSV file that are missing from, or differ in the second.
 *
 * Both files must start with a header line; columns are looked up by header name.
 * Rows are matched on the "id" column and considered different when their "size"
 * values differ.
 *
 * Each result has 'id' and 'file1' =
 * [id, size, creation_time, filename, bucket, client_id, attachment_id]; columns the
 * first file does not have are null. When the id exists in the second file but
 * differs, 'file2' = [id, size, creation_time, filename] is added.
 *
 * @param string $file1_path The file path of the first CSV file.
 * @param string $file2_path The file path of the second CSV file.
 * @return array The missing or mismatched entries of the first file.
 * @throws RuntimeException If one of the files cannot be opened.
 */
private function compare_csv_files(string $file1_path, string $file2_path): array
{
    // Loads a CSV file as [header name => column index, list of data rows].
    $load = static function (string $path): array {
        $handle = fopen($path, 'r');
        if ($handle === false) {
            // Treating an unreadable file as empty would report every entry as missing.
            throw new RuntimeException('Failed to read ' . $path . '.');
        }
        $columns = [];
        $rows = [];
        $headers = fgetcsv($handle);
        if (is_array($headers)) {
            foreach ($headers as $index => $header) {
                if (is_string($header) && !isset($columns[$header])) {
                    $columns[$header] = $index;
                }
            }
            while (($row = fgetcsv($handle)) !== false) {
                if ($row !== [null]) { // blank line
                    $rows[] = $row;
                }
            }
        }
        fclose($handle);
        return [$columns, $rows];
    };

    // Returns the value of a named column, or null when the column or cell is absent.
    $pick = static function (array $row, array $columns, string $name) {
        return isset($columns[$name], $row[$columns[$name]]) ? $row[$columns[$name]] : null;
    };

    [$file1_columns, $file1_rows] = $load($file1_path);
    [$file2_columns, $file2_rows] = $load($file2_path);

    $file1_assoc = [];
    foreach ($file1_rows as $row) {
        $id = $pick($row, $file1_columns, 'id');
        if ($id === null) {
            continue;
        }
        // bucket comes before client_id: the report writer reads index 4 as the
        // bucket and index 5 as the client id.
        $file1_assoc[$id] = [
            $id,
            $pick($row, $file1_columns, 'size'),
            $pick($row, $file1_columns, 'creation_time'),
            $pick($row, $file1_columns, 'filename'),
            $pick($row, $file1_columns, 'bucket'),
            $pick($row, $file1_columns, 'client_id'),
            $pick($row, $file1_columns, 'attachment_id'),
        ];
    }

    $file2_assoc = [];
    foreach ($file2_rows as $row) {
        $id = $pick($row, $file2_columns, 'id');
        if ($id === null) {
            continue;
        }
        $file2_assoc[$id] = [
            $id,
            $pick($row, $file2_columns, 'size'),
            $pick($row, $file2_columns, 'creation_time'),
            $pick($row, $file2_columns, 'filename'),
        ];
    }

    $missing_entries = [];
    foreach ($file1_assoc as $id => $file1) {
        if (!isset($file2_assoc[$id])) {
            $missing_entries[] = [
                'id' => $id,
                'file1' => $file1,
            ];
            continue;
        }

        $file2 = $file2_assoc[$id];
        if ($file1[0] !== $file2[0] || $file1[1] !== $file2[1]) {
            $missing_entries[] = [
                'id' => $id,
                'file1' => $file1,
                'file2' => $file2,
            ];
        }
    }

    return $missing_entries;
}
/**
 * Writes the CSV report of Cassandra entries that have no matching physical file.
 *
 * Each entry's 'file1' list is read as
 * [id, size, creation_time, filename, bucket, client_id, attachment_id]. For numeric
 * ids the filename is reported as path and name; otherwise the id is. The report
 * file is made world readable and writable (0666) afterwards.
 *
 * @param array  $inconsistentFiles Entries with 'id' and 'file1' (optionally 'file2',
 *                                  'thumb1', 'thumb2') keys.
 * @param string $name              The path of the CSV report file to write.
 * @return void
 * @throws Exception If the report file cannot be opened for writing.
 */
private function generateCsvReportForPhysicalFiles(array $inconsistentFiles, string $name): void
{
    $fp = fopen($name, 'w');
    if (!$fp) {
        // Same failure handling as generateHtmlReport().
        throw new Exception('Failed to open the file for writing.');
    }

    fputcsv($fp, [
        self::CSV_COLUMN_FILE_ATTACHMENT,
        self::CSV_COLUMN_FILE_PATH,
        self::CSV_COLUMN_FILE_NAME,
        self::CSV_COLUMN_THUMB1,
        self::CSV_COLUMN_THUMB2,
        self::CSV_COLUMN_SIZE,
        self::CSV_COLUMN_CREATION_TIME,
        self::CSV_COLUMN_CLIENT_ID,
        self::CSV_COLUMN_BUCKET,
        self::CSV_COLUMN_ID,
    ]);

    foreach ($inconsistentFiles as $row) {
        $file1 = $row['file1'] ?? [];

        $check_value = $row['id'];
        $filePath = $check_value ? $check_value : ($row['file2'][0] ?? '');
        if (is_numeric($row['id'])) {
            // Numeric ids are reported by file name.
            $filePath = $file1[3] ?? '';
            $check_value = $filePath;
        }

        // Fall back to the file system only when the entry carries no value itself.
        $size = isset($file1[1]) ? (string) $file1[1] : filesize($filePath);
        $creationTime = isset($file1[2])
            ? (string) $file1[2]
            : date('Y-m-d H:i:s', filectime($filePath));

        fputcsv($fp, [
            'Attachment',
            $filePath,
            $check_value,
            $row['thumb1'] ?? '',
            $row['thumb2'] ?? '',
            $size,
            $creationTime,
            $file1[5] ?? '', // client id
            $file1[4] ?? '', // bucket
            $file1[6] ?? '', // attachment id
        ]);
    }

    fclose($fp);
    chmod($name, 0666);
}
/**
 * Writes the CSV report of physical files that have no matching Cassandra entry.
 *
 * The entry's file name is taken from index 3 of 'file1' and, when that is absent
 * or empty, from index 3 of 'file2'. Thumbnail names are the file name followed by
 * "-thumb1" / "-thumb2", and are left empty when the id contains a dot. The report
 * file is made world readable and writable (0666) afterwards.
 *
 * @param array  $inconsistentFiles Entries with 'id' and 'file1' (optionally 'file2') keys.
 * @param string $name              The path of the CSV report file to write.
 * @return void
 * @throws Exception If the report file cannot be opened for writing.
 */
private function generateCsvReportForDbEntries(array $inconsistentFiles, string $name): void
{
    $fp = fopen($name, 'w');
    if (!$fp) {
        // Same failure handling as generateHtmlReport().
        throw new Exception('Failed to open the file for writing.');
    }

    fputcsv($fp, [
        self::CSV_COLUMN_FILE_ATTACHMENT,
        'Entry Path',
        'Entry Name',
        self::CSV_COLUMN_THUMB1,
        self::CSV_COLUMN_THUMB2,
        self::CSV_COLUMN_SIZE,
        self::CSV_COLUMN_CREATION_TIME,
    ]);

    foreach ($inconsistentFiles as $row) {
        $file1 = $row['file1'] ?? [];

        // Choose the file name first, then build the path. Concatenating before the
        // ternary made its condition always true, so the file2 fallback never ran.
        $entryFile = (isset($file1[3]) && $file1[3] !== '')
            ? $file1[3]
            : ($row['file2'][3] ?? '');
        $filePath = $this->directory . DIRECTORY_SEPARATOR . $entryFile;

        $size = isset($file1[1]) ? (string) $file1[1] : filesize($filePath);
        $creationTime = isset($file1[2])
            ? (string) $file1[2]
            : date('Y-m-d H:i:s', filectime($filePath));

        $thumb1 = $entryFile . '-thumb1';
        $thumb2 = $entryFile . '-thumb2';
        if (is_string($row['id']) && strpos($row['id'], '.') !== false) {
            // Ids containing a dot are reported without thumbnails.
            $thumb1 = '';
            $thumb2 = '';
        }

        fputcsv($fp, [
            'File',
            $filePath,
            $file1[0] ?? '',
            $thumb1,
            $thumb2,
            $size,
            $creationTime,
        ]);
    }

    fclose($fp);
    chmod($name, 0666);
}
/**
 * Renders the inconsistent entries into report_template.html and saves the result.
 *
 * One table row is produced per entry and the rows replace the {{table_rows}}
 * placeholder of the template. The name shown is index 0 of 'file1', replaced by
 * index 3 of 'file1' when that starts with digits followed by a dot, and by the id
 * when the id is a string containing a dot; in both replacement cases the thumbnail
 * cells are left empty.
 *
 * @param array  $inconsistentFiles Entries with 'id' and 'file1' keys.
 * @param string $name              The path of the HTML report file to write.
 * @return void
 * @throws Exception If the template cannot be read or the report cannot be opened for writing.
 */
private function generateHtmlReport(array $inconsistentFiles, string $name): void
{
    $templateContent = file_get_contents('report_template.html');
    if ($templateContent === false) {
        throw new Exception('Failed to read the HTML template file.');
    }

    $tableRows = '';
    foreach ($inconsistentFiles as $row) {
        $file1 = $row['file1'];
        $filename = $file1[0];
        $filePath = $this->directory . DIRECTORY_SEPARATOR . $filename;

        // Size and time fall back to the file system using the path built from index 0.
        if (isset($file1[1])) {
            $size = (string) $file1[1];
        } else {
            $size = filesize($filePath);
        }
        if (isset($file1[2])) {
            $creationTime = str_replace('"', '', $file1[2]);
        } else {
            $creationTime = date('Y-m-d H:i:s', filectime($filePath));
        }

        $thumb1 = $file1[0] . '-thumb1';
        $thumb2 = $file1[0] . '-thumb2';

        // The id-based name is checked last and therefore takes precedence.
        $replacementName = null;
        if (isset($file1[3]) && preg_match('/^[0-9]+\./', $file1[3])) {
            $replacementName = $file1[3];
        }
        if (is_string($row['id']) && strpos($row['id'], '.') !== false) {
            $replacementName = $row['id'];
        }
        if ($replacementName !== null) {
            $filename = $replacementName;
            $filePath = $this->directory . DIRECTORY_SEPARATOR . $filename;
            $thumb1 = '';
            $thumb2 = '';
        }

        $cells = [
            'Attachment',
            htmlspecialchars($filePath),
            htmlspecialchars($filename),
            htmlspecialchars($thumb1),
            htmlspecialchars($thumb2),
            htmlspecialchars($size),
            htmlspecialchars($creationTime),
        ];
        $tableRows .= '<tr><td>' . implode('</td><td>', $cells) . '</td></tr>';
    }

    $html = str_replace('{{table_rows}}', $tableRows, $templateContent);

    $file = fopen($name, 'w');
    if (!$file) {
        throw new Exception('Failed to open the file for writing.');
    }
    fwrite($file, $html);
    fclose($file);
}
2023-05-19 11:58:21 +02:00
/**
 * Returns the Cassandra schema version currently configured for the script.
 *
 * @return int The value of the static $schemaVersion property.
 */
private function schema_version(): int
{
    $version = static::$schemaVersion;

    return $version;
}
2023-05-29 08:23:39 +02:00
2023-05-19 11:58:21 +02:00
/**
 * Looks up one row of attachment_file_info and returns it as an object.
 *
 * For schema version 1 the row is selected by id and client id; for other versions
 * the bucket is part of the condition as well. All values are passed as bound
 * arguments. On the returned object, id is cast to string and size (when present)
 * to int.
 *
 * @param int $clientId
 * @param string|null $bucketId Only used when the schema version is not 1; null is sent as ''.
 * @param string $id
 * @return object|null The row as an object, or null when nothing was found.
 */
private function get_info(int $clientId, ?string $bucketId, string $id): ?object
{
    $cql = 'SELECT * FROM attachment_file_info WHERE id = ? AND client_id = ?';
    $args = [
        'client_id' => $clientId,
        'id' => $id,
    ];
    if ($this->schema_version() != 1) {
        // Bind the bucket instead of concatenating it into the statement text.
        $cql .= ' AND bucket = ?';
        $args['bucket'] = (string) $bucketId;
    }

    $query = $this->cassandra->prepare($cql);
    $res = $this->cassandra->execute(
        $query,
        [
            'arguments' => $args,
        ]
    );

    if (!$res || !$res->valid()) {
        return null;
    }

    $tmp = $res->current();
    $attachment = (object) $tmp;
    $attachment->id = (string) $tmp['id'];
    if (array_key_exists('size', $tmp)) {
        $attachment->size = (int) $tmp['size'];
    }

    return $attachment;
}
/**
 * Derives the 'bucket' partitioning value from an id.
 *
 * The bucket is the leading part of the id, $bucketMagic characters long.
 *
 * @param string $id
 * @return string
 */
private function _set_bucket(string $id): string
{
    $prefixLength = static::$bucketMagic;

    return substr($id, 0, $prefixLength);
}
2023-05-19 11:58:21 +02:00
/**
 * Updates the attachment count and total size stored in attachment_stats.
 *
 * Runs two UPDATE statements for the current client: one changes count by 1, the
 * other changes size by $size. Both add when $add is true and subtract otherwise.
 *
 * @param boolean $add  True to add, false to subtract.
 * @param integer $size The size to add or subtract.
 *
 * @return void
 */
private function _update_attachment_stats(bool $add, int $size): void
{
    // Arithmetic operator for the CQL statement. It must not be taken from
    // self::DELIMITER: that constant is the file-name separator and only happens
    // to be '-' as well.
    $op = $add ? '+' : '-';
    // Send the same integer client id with both statements.
    $arguments = ['client_id' => (int) static::$clientId];

    $query = $this->cassandra->prepare('UPDATE attachment_stats SET count = count ' . $op . ' 1 where client_id = ?');
    $this->cassandra->execute($query, ['arguments' => $arguments]);

    $query = $this->cassandra->prepare(
        'UPDATE attachment_stats SET size = size ' . $op . ' ' . $size . ' where client_id = ?'
    );
    $this->cassandra->execute($query, ['arguments' => $arguments]);
}
/**
 * Fetches the attachment_ids row for an id and returns it as an object.
 *
 * The row is selected by id, the bucket derived from the id, and the current client id.
 *
 * @param string $id
 *
 * @return object|null The first matching row, or null when there is none.
 */
private function _get_attachment_key(string $id): ?object
{
    $statement = $this->cassandra->prepare('SELECT * from attachment_ids where id = ? AND bucket = ? AND client_id = ?');
    $rows = $this->cassandra->execute(
        $statement,
        [
            'arguments' => [
                'client_id' => static::$clientId,
                'bucket' => $this->_set_bucket($id),
                'id' => $id,
            ],
        ]
    );

    if (!$rows || !$rows->valid()) {
        return null;
    }

    return (object) $rows->current();
}
/**
 * Loads the attachments row that belongs to an attachment id.
 *
 * The full key is first resolved through _get_attachment_key(). The row is then
 * selected by id, client id, project id and entity type, plus entity id when the
 * schema version is not 1. An InvalidArgumentException raised by the driver is
 * ignored and results in null.
 *
 * @param string $attachmentId
 *
 * @return object|null The converted row, or null when no key or row was found.
 */
private function _get_attachment_by_id(string $attachmentId): ?object
{
    $attachment = null;
    try {
        $key = $this->_get_attachment_key($attachmentId);

        if ($key) {
            $isFirstSchema = $this->schema_version() === 1;

            $cql = 'SELECT * FROM attachments WHERE id = ? AND client_id = ? AND project_id = ? AND entity_type = ?';
            if (!$isFirstSchema) {
                $cql .= ' AND entity_id = ?';
            }
            $query = $this->cassandra->prepare($cql);

            $arguments = [
                'client_id' => static::$clientId,
                'id' => $key->id,
                'project_id' => $key->project_id,
            ];
            if (!$isFirstSchema) {
                $arguments['entity_id'] = $key->entity_id;
            }
            $arguments['entity_type'] = $key->entity_type;

            $attachment = $this->cassandra->execute($query, ['arguments' => $arguments]);
        }
    } catch (Cassandra\Exception\InvalidArgumentException $e) {
        // Intentionally ignored: the method then reports "not found".
    }

    if ($attachment == null || !$attachment->valid()) {
        return null;
    }

    return $this->_convert_to_object($attachment->current());
}
/**
 * Removes the attachment_ids entry for the given id.
 *
 * The statement is submitted with executeAsync() and its result is discarded,
 * so this method does not report whether the delete succeeded.
 *
 * @param string $id attachment identifier
 *
 * @return void
 */
private function _delete_attachment_key(string $id): void
{
    $statement = $this->cassandra->prepare('DELETE FROM attachment_ids WHERE id = ? AND bucket = ? AND client_id = ?');

    $options = [
        'arguments' => [
            'client_id' => static::$clientId,
            'bucket' => $this->_set_bucket($id),
            'id' => $id,
        ],
    ];

    $this->cassandra->executeAsync($statement, $options);
}
/**
 * Updates the reference bookkeeping of a stored file.
 *
 * Always changes attachment_file_refs.ref_count by one for the file. When an
 * attachment id is supplied, the matching attachment_file_ids row is additionally
 * inserted ($add = true) or deleted ($add = false). All statements are submitted
 * with executeAsync() and their results are discarded.
 *
 * @param string $data_id      identifier of the stored file
 * @param bool   $add          true to add a reference, false to remove one
 * @param string $attachmentId attachment holding the reference; '' to skip the
 *                             attachment_file_ids update
 *
 * @return void
 */
private function _update_file_refs(string $data_id, bool $add, string $attachmentId = ''): void
{
    // Spelled out instead of borrowing self::DELIMITER, which only coincidentally equals '-'.
    $op = $add ? '+' : '-';

    $queryArguments = [
        'client_id' => static::$clientId,
        'bucket' => $this->_set_bucket($data_id),
        'id' => $data_id,
    ];

    $query = $this->cassandra->prepare(
        'UPDATE attachment_file_refs SET ref_count = ref_count ' . $op . ' 1 WHERE bucket = ? AND id = ? AND client_id = ?'
    );
    $this->cassandra->executeAsync($query, ['arguments' => $queryArguments]);

    // Compare against '' explicitly: empty() would also treat the id "0" as "not provided".
    if ($attachmentId === '') {
        return;
    }

    $queryArguments['attachment_id'] = $attachmentId;
    $query = $add
        ? $this->cassandra->prepare('INSERT INTO attachment_file_ids (client_id,bucket,id,attachment_id) VALUES(?,?,?,?)')
        : $this->cassandra->prepare('DELETE FROM attachment_file_ids WHERE client_id = ? AND bucket = ? AND attachment_id = ? AND id = ?');

    $this->cassandra->executeAsync($query, ['arguments' => $queryArguments]);
}
/**
 * Deletes a stored file and the attachment rows that reference it from Cassandra.
 *
 * Steps, in order:
 *  1. load the file info via get_info(); abort with false when there is none,
 *  2. decrease the per-client attachment statistics by the file size,
 *  3. delete the attachment_file_info and attachment_file_refs rows,
 *  4. for every attachment_id listed in attachment_file_ids, delete its
 *     attachment_ids key and its attachments row,
 *  5. delete the attachment_file_ids rows,
 *  6. if $id itself resolves to an attachment, delete that row, decrease the
 *     reference count of its data file and delete its attachment_ids key.
 *
 * Side effect: static::$clientId is set to $clientId.
 *
 * @param int         $clientId client owning the data
 * @param string|null $bucketId bucket of the attachment_file_info row (bound only for schema version 2)
 * @param string      $id       file / attachment identifier
 *
 * @return bool true only when step 6 deleted an attachment row; false when no file
 *              info exists or when $id does not resolve to an attachment (steps 2-5
 *              may still have been carried out in the latter case)
 */
public function deleteAttachment(int $clientId, ?string $bucketId, string $id): bool
{
    static::$clientId = $clientId;

    $fileInfo = $this->get_info($clientId, $bucketId, $id);
    if (!$fileInfo) {
        return false;
    }
    $this->_update_attachment_stats(false, (int) $fileInfo->size);

    $schemaVersion = $this->schema_version();

    // The bucket is bound as a parameter instead of being concatenated into the CQL text,
    // so quotes or other special characters in $bucketId cannot alter the statement.
    $infoCql = 'DELETE FROM attachment_file_info WHERE id = ? AND client_id = ?';
    $infoArgs = [
        'client_id' => $clientId,
        'id' => $id,
    ];
    if ($schemaVersion === 2) {
        $infoCql .= ' AND bucket = ?';
        $infoArgs['bucket'] = (string) $bucketId;
    }
    $this->cassandra->execute($this->cassandra->prepare($infoCql), ['arguments' => $infoArgs]);

    // The reference tables are keyed by the bucket derived from the id, not by $bucketId.
    $refData = [
        'client_id' => $clientId,
        'id' => $id,
        'bucket' => $this->_set_bucket($id),
    ];
    $query = $this->cassandra->prepare(
        'DELETE FROM attachment_file_refs WHERE bucket = ? AND id = ? AND client_id = ?'
    );
    $this->cassandra->execute($query, ['arguments' => $refData]);

    // Get all attachments that reference the deleted file and remove them.
    $query = $this->cassandra->prepare(
        'SELECT attachment_id FROM attachment_file_ids WHERE bucket = ? AND id = ? AND client_id = ?'
    );
    $referencing = $this->cassandra->execute($query, ['arguments' => $refData]);

    // Arguments are bound by name, so this one statement serves every attachments delete below.
    $delCql = 'DELETE FROM attachments WHERE id = ? AND client_id = ? AND project_id = ? AND entity_type = ?';
    if ($schemaVersion !== 1) {
        $delCql .= ' AND entity_id = ?';
    }
    $delQuery = $this->cassandra->prepare($delCql);

    while ($referencing && $referencing->valid()) {
        $attachmentId = $referencing->current()['attachment_id'];
        $key = $this->_get_attachment_key((string) $attachmentId);

        if ($key !== null) {
            $delArgs = [
                'client_id' => static::$clientId,
                'project_id' => $key->project_id,
                'entity_type' => $key->entity_type,
                'id' => $attachmentId,
            ];
            if ($schemaVersion !== 1) {
                $delArgs['entity_id'] = $key->entity_id;
            }

            $this->_delete_attachment_key((string) $attachmentId);
            $this->cassandra->execute($delQuery, ['arguments' => $delArgs]);
        }

        $referencing->next();
    }

    $query = $this->cassandra->prepare(
        'DELETE FROM attachment_file_ids WHERE bucket = ? AND id = ? AND client_id = ?'
    );
    $this->cassandra->execute($query, ['arguments' => $refData]);

    // Finally handle the case where $id is itself an attachment id.
    $attachment = $this->_get_attachment_by_id($id);
    if (!$attachment) {
        return false;
    }

    $ownArgs = [
        'client_id' => static::$clientId,
        'id' => $id,
        'project_id' => $attachment->project_id,
        'entity_type' => $attachment->entity_type,
    ];
    if ($schemaVersion !== 1) {
        $ownArgs['entity_id'] = $attachment->entity_id;
    }

    // Loose comparison kept deliberately: any "no result" value counts as failure.
    if ($this->cassandra->execute($delQuery, ['arguments' => $ownArgs]) == null) {
        return false;
    }

    $this->_update_file_refs($attachment->data_id, false, $attachment->id);
    $this->_delete_attachment_key($id);

    return true;
}
/**
 * Deletes the attachments or physical files listed in a CSV report.
 *
 * Asks for confirmation on STDIN first; anything other than "yes" aborts without
 * touching data. The first line of the file is treated as the header and skipped,
 * as is any row whose first column equals the header label. Rows need at least
 * 7 columns; attachment rows need 10.
 *
 * Columns: File/Attachment, File path, File name, Thumb 1, Thumb 2, Size,
 * Creation Time, ClientId, Bucket, Id
 *
 *  - $src = 'cassandra': rows whose first column is "Attachment" are removed through
 *    deleteAttachment(ClientId, Bucket, Id).
 *  - $src = 'file': rows whose first column is "File" have their file and the two
 *    thumbnails (looked up in the directory of the file path) unlinked.
 *
 * Every successful deletion is appended to 'deleted_files.log'.
 *
 * @param string $file path of the CSV report
 * @param string $src  which rows to act on ('cassandra'/'file')
 *
 * @return void
 *
 * example file:
 *
 * File/Attachment,File path,File name,Thumb 1,Thumb 2,Size,Creation Time,ClientId,Bucket,Id
 * Attachment,/test1/testx,testx,testx-thumb1,testx-thumb2,4343,20-02-22 13:30,1,a,abc-def
 * File,/tmp/testx3f,testx3f,testx3f-thumb1,testx3f-thumb2,4343,20-02-22 13:30,,,
 */
public function processAttachmentDeletionCSV(string $file, string $src): void
{
    echo "Before proceeding with the deletion, make sure you have a backup of your data." . PHP_EOL;
    echo "You can revert back to the backup in case of accidental data loss." . PHP_EOL;
    echo "Do you want to delete the physical files listed in the CSV report? (yes/no): ";
    // fgets() returns false on EOF; the cast keeps trim() working on a string.
    $confirmation = trim((string) fgets(STDIN));
    if (strtolower($confirmation) !== 'yes') {
        return;
    }

    $this->init();

    $contents = file_get_contents($file);
    if ($contents === false) {
        echo "Unable to read CSV file: $file" . PHP_EOL;
        return;
    }

    $logFile = 'deleted_files.log';
    $logHandle = fopen($logFile, 'a');
    if ($logHandle === false) {
        echo "Unable to open log file: $logFile" . PHP_EOL;
        return;
    }

    try {
        foreach (explode("\n", $contents) as $index => $rawLine) {
            // Tolerate Windows line endings.
            $line = rtrim($rawLine, "\r");

            // The first line is the header; blank lines carry no data.
            if ($index === 0 || $line === '') {
                continue;
            }

            // A real CSV parser keeps quoted fields (e.g. paths containing commas) intact.
            $values = str_getcsv($line, ',', '"', '\\');
            if (count($values) < 7 || $values[0] === self::CSV_COLUMN_FILE_ATTACHMENT) {
                continue;
            }

            $source = $values[0];

            if ($source === 'Attachment' && $src === 'cassandra') {
                // ClientId, Bucket and Id live in columns 8-10; without them nothing can be addressed.
                if (count($values) < 10) {
                    echo "Skipping attachment row without ClientId/Bucket/Id: $line" . PHP_EOL;
                    continue;
                }

                $clientId = (int) $values[7];
                $bucket = (string) $values[8];
                $id = (string) $values[9];

                echo "will delete " . $clientId . " : " . $bucket . " : " . $id . PHP_EOL;
                if ($this->deleteAttachment($clientId, $bucket, $id)) {
                    fwrite($logHandle, "Deleted attachment: $id" . PHP_EOL);
                } else {
                    // deleteAttachment() reported failure; do not claim a deletion in the log.
                    echo "Attachment not deleted: $id" . PHP_EOL;
                    fwrite($logHandle, "Attachment not deleted: $id" . PHP_EOL);
                }
            } elseif ($source === 'File' && $src === 'file') {
                $filePath = (string) $values[1];
                $directory = dirname($filePath);

                // Each thumbnail is handled on its own and exactly once. is_file() (rather
                // than file_exists()) keeps an empty thumbnail column from matching the directory.
                foreach ([$values[3], $values[4]] as $thumbName) {
                    $thumbName = (string) $thumbName;
                    if ($thumbName === '') {
                        continue;
                    }
                    $thumbPath = $directory . DIRECTORY_SEPARATOR . $thumbName;
                    if (is_file($thumbPath) && unlink($thumbPath)) {
                        fwrite($logHandle, "Deleted thumbnail: " . $thumbPath . PHP_EOL);
                    }
                }

                if (!is_file($filePath)) {
                    echo "File not found: $filePath" . PHP_EOL;
                } elseif (unlink($filePath)) {
                    echo "File deleted: $filePath" . PHP_EOL;
                    // Write the deleted file path to the log file
                    fwrite($logHandle, "Deleted file: $filePath" . PHP_EOL);
                } else {
                    echo "Could not delete file: $filePath" . PHP_EOL;
                }
            }
        }
    } finally {
        // Release the log handle even if a deletion throws.
        fclose($logHandle);
    }

    echo "Deletion completed. The list of attachments is saved in '$logFile'." . PHP_EOL;
}
}
// Script entry point. Constructing the checker already hands $_SERVER['argv'] to
// runFromCommandLine(); afterwards a consistency check is started explicitly.
$checker = new DataConsistencyChecker();
$checker->checkConsistency('attachment_file_info', true);