Skip to content

Commit

Permalink
issue #32 - date anonymizer
Browse files Browse the repository at this point in the history
  • Loading branch information
pounard committed Mar 15, 2024
1 parent adaae94 commit 48691a5
Show file tree
Hide file tree
Showing 6 changed files with 590 additions and 3 deletions.
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"php": ">=8.1",
"doctrine/doctrine-bundle": "^2.10.0",
"doctrine/orm": "^2.15",
"makinacorpus/query-builder": "^0.3.1",
"makinacorpus/query-builder": "^1.1.3",
"symfony/config": "^6.0|^7.0",
"symfony/console": "^6.0|^7.0",
"symfony/dependency-injection": "^6.0|^7.0",
Expand Down
86 changes: 86 additions & 0 deletions docs/content/anonymization/core-anonymizers.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,92 @@ customer:
```
:::

## DateAnonymizer

Anonymize dates by either:
- randomly choosing a date or datetime in a given range delimited by `min` and `max` options,
- altering the initial value by adding to it a random value picked in a range computed from the `delta` option.

`min` and `max` options can be any string that can be parsed as a date by the `DateTime`
class constructor, for example:
- an absolute date: `2024-03-15` or datetime: `2024-03-15 10:28:56`,
- a relative time: `now +2 hours`, `-3 month`, ...

`delta` option can be either:
- an ISO interval specification, such as: `P1DT1M` (1 day and 1 minute),
- a human readable date string that PHP can parse: `1 month -3 day +3 minutes`.

You can additionally set the `format` parameter:
- `date` will cast the generated date as a date without time,
- `datetime` will generate a full timestamp.

::: code-group
```php [Attribute]
namespace App\Entity;

use Doctrine\ORM\Mapping as ORM;
use MakinaCorpus\DbToolsBundle\Attribute\Anonymize;

#[ORM\Entity()]
#[ORM\Table(name: 'customer')]
class Customer
{
// ...

#[ORM\Column]
// Will add to the existing date a random interval // [!code ++]
// in the [-delta, +delta] interval. // [!code ++]
#[Anonymize(type: 'date', options: ['delta' => '1 month 15 day'])] // [!code ++]
private ?\DateTime $birthDate = null;

#[ORM\Column]
// Will pick a random date // [!code ++]
// in the given [min, max] interval. // [!code ++]
#[Anonymize(type: 'date', options: ['min' => 'now -3 month', 'max' => 'now'])] // [!code ++]
private ?\DateTimeImmutable $lastLogin = null;

#[ORM\Column]
// An example with absolute dates. // [!code ++]
#[Anonymize(type: 'date', options: ['min' => '1789-05-05', 'max' => '2024-03-15', 'format' => 'date'])] // [!code ++]
private ?\DateTime $createdAt = null;
}
```

```yml [YAML]
# config/anonymization.yaml

customer:
# Will add to the existing date a random interval in the [-delta, +delta] interval.
birthDate:
anonymizer: date
options: {delta: '1 month 15 day'}

# Will pick a random date in the given [min, max] interval.
lastLogin:
anonymizer: date
options: {min: 'now -3 month', max: 'now'}

# An example with absolute dates.
createdAt:
anonymizer: date
options: {min: '1789-05-05', max: '2024-03-15', format: 'date'}

#...
```
:::

:::warning
Dates you give for the `min` and `max` values are interpreted using the
default timezone configured in PHP.
:::

:::note
When using a date range over 68 years, random granularity stops at the hour
in order to avoid passing an overflowing integer value to the date add operation.
:::

## NullAnonymizer

Set all values to `NULL`.
Expand Down
201 changes: 201 additions & 0 deletions src/Anonymization/Anonymizer/Core/DateAnonymizer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
<?php

declare(strict_types=1);

namespace MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Core;

use Doctrine\DBAL\Platforms\AbstractMySQLPlatform;
use Doctrine\DBAL\Platforms\SqlitePlatform;
use MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\AbstractAnonymizer;
use MakinaCorpus\DbToolsBundle\Attribute\AsAnonymizer;
use MakinaCorpus\QueryBuilder\Query\Update;

/**
 * Anonymizes a date or datetime column by replacing its value with a random date.
 *
 * Two mutually exclusive modes are supported:
 *  - 'min' and 'max' options: the new value is a random date picked between
 *    both bounds,
 *  - 'delta' option: the new value is the existing one shifted by a random
 *    amount picked in the [-delta, +delta] range.
 *
 * The 'format' option ('date' or 'datetime', default 'datetime') selects the
 * SQL type the result is cast to and the granularity of the random offset.
 *
 * The value assigned to the column is wrapped with getSetIfNotNullExpression()
 * and the random offset comes from getRandomIntExpression(); both are
 * inherited helpers not defined in this class (presumably NULL values are
 * left untouched — confirm against AbstractAnonymizer).
 */
#[AsAnonymizer(
    name: 'date',
    pack: 'core',
    description: <<<TXT
        Anonymize a column by changing the date it contains.
        You can either choose a 'min' and a 'max' date, case in which a random date will
        be selected between these bounds, or alternatively set a 'delta' which must be
        a valid date interval string (e.g. "1 week", "1 day 10 hours", ...).
        You should set the 'format' (default: 'datetime') value as this anonymizator
        can work with 'datetime' or 'date' formats.
        TXT
)]
class DateAnonymizer extends AbstractAnonymizer
{
    /**
     * Validates the options and registers the anonymization expression on
     * the given update query.
     *
     * @throws \InvalidArgumentException
     *   When 'format' is unknown, when only one of 'min'/'max' is given, when
     *   'delta' is combined with 'min'/'max', when a date or interval cannot
     *   be parsed, or when neither mode is configured.
     */
    #[\Override]
    public function anonymize(Update $update): void
    {
        $format = $this->options->get('format', 'datetime');
        // Strict comparison: a loose one would accept e.g. boolean true.
        if (!\in_array($format, ['date', 'datetime'], true)) {
            throw new \InvalidArgumentException(\sprintf("'format' value is invalid, expected 'date' or 'datetime', got '%s'.", $format));
        }

        // Empty option values are considered as being unset.
        $min = $this->options->get('min');
        $min = $min ? $this->parseDate($min, 'min') : null;
        $max = $this->options->get('max');
        $max = $max ? $this->parseDate($max, 'max') : null;

        if ((null === $min) !== (null === $max)) {
            throw new \InvalidArgumentException("You must specify both 'min' and 'max' boundaries.");
        }

        $delta = $this->options->get('delta');

        if (null !== $min && null !== $max) {
            if ($delta) {
                throw new \InvalidArgumentException("'delta' option cannot be specified if 'min' and 'max' are in use.");
            }

            $this->anonymizeWithDateRange($update, $format, $min, $max);

            return;
        }

        if ($delta) {
            $this->anonymizeWithDelta($update, $format, $this->parseInterval($delta));

            return;
        }

        throw new \InvalidArgumentException("Providing either the 'delta' option, or both 'min' and 'max' options is required.");
    }

    /**
     * Replaces values with a random date picked between $min and $max.
     *
     * This is implemented as "middle of the range, plus or minus half of the
     * range", which allows sharing the SQL generation with the delta mode.
     */
    private function anonymizeWithDateRange(Update $update, string $format, \DateTimeImmutable $min, \DateTimeImmutable $max): void
    {
        // Tolerate reversed boundaries: the middle date is computed from the
        // lower bound, so it must really be the earliest of both dates.
        if ($max < $min) {
            [$min, $max] = [$max, $min];
        }

        $diff = $min->diff($max);
        // 'days' is the exact total number of days between both dates. It is
        // always populated on intervals created by diff(), the cast is only
        // here to narrow the int|false property type.
        $days = (int) $diff->days;

        if ('date' === $format) {
            $unit = 'day';
            $span = $days;
        } elseif (68 < $diff->y) {
            // Over 68 years, a number of seconds may not fit into a signed
            // 32 bits integer and could overflow server side. In order to
            // avoid this, we lower the granularity to hours.
            $unit = 'hour';
            $span = $days * 24 + $diff->h;
        } else {
            $unit = 'second';
            $span = $days * 86400 + $diff->h * 3600 + $diff->i * 60 + $diff->s;
        }

        // Cut in half to compute middle date. Integer division is required:
        // a float here would be rejected by the int-typed parameter below
        // since this file runs with strict types.
        $half = \intdiv($span, 2);
        $middleDate = $min->add(\DateInterval::createFromDateString(\sprintf("%d %s", $half, $unit)));

        $this->anonymizeWithDeltaAndReferenceDate($update, $format, $middleDate, $half, $unit);
    }

    /**
     * Shifts existing values by a random amount in the [-delta, +delta] range.
     *
     * @throws \InvalidArgumentException
     *   When the interval holds no usable component for the given format, or
     *   when its components cancel each other out.
     */
    private function anonymizeWithDelta(Update $update, string $format, \DateInterval $delta): void
    {
        // @todo I wish for a better alternative...
        // query-builder can deal with \DateInterval by itself, but we are
        // randomizing values here, so we need to be able to apply a single
        // figure random delta, in order to be able to use SQL random at the
        // right place, otherwise the algorithm would be very complex.
        // In order to achieve this, we arbitrarily convert a month to 30
        // days: we are working on an interval value hence we cannot guess
        // which will be the exact impacted month duration in days. This will
        // create a deviation where the interval may be more or less a few
        // days than the user expected, it's an acceptable deviation.
        $withTime = 'date' !== $format;

        // The smallest non-zero component decides the unit. Time components
        // are ignored when working with plain dates.
        [$amount, $unit] = match (true) {
            // Please, never use seconds...
            $withTime && 0 !== $delta->s => [
                $delta->s + $delta->i * 60 + $delta->h * 3600 + $delta->d * 86400 + $delta->m * 2592000 + $delta->y * 31104000,
                'second',
            ],
            $withTime && 0 !== $delta->i => [
                $delta->i + $delta->h * 60 + $delta->d * 1440 + $delta->m * 43200 + $delta->y * 518400,
                'minute',
            ],
            $withTime && 0 !== $delta->h => [
                $delta->h + $delta->d * 24 + $delta->m * 720 + $delta->y * 8640,
                'hour',
            ],
            0 !== $delta->d => [$delta->d + $delta->m * 30 + $delta->y * 360, 'day'],
            0 !== $delta->m => [$delta->m + $delta->y * 12, 'month'],
            0 !== $delta->y => [$delta->y, 'year'],
            default => throw new \InvalidArgumentException("'delta' option interval is empty."),
        };

        // Relative strings may carry negative components (e.g. "-3 day"), the
        // random range is symmetric so only the magnitude matters.
        $amount = \abs($amount);
        if (0 === $amount) {
            throw new \InvalidArgumentException("'delta' option interval is empty.");
        }

        $columnExpr = $update->expression()->column($this->columnName, $this->tableName);

        $this->anonymizeWithDeltaAndReferenceDate($update, $format, $columnExpr, $amount, $unit);
    }

    /**
     * Sets the column to "$referenceDate + random($unit)" where the random
     * amount is requested in the [-$delta, +$delta] range.
     *
     * @param mixed $referenceDate
     *   Anything the query builder dateAdd() expression accepts: this class
     *   passes either a \DateTimeImmutable or a column expression.
     */
    private function anonymizeWithDeltaAndReferenceDate(Update $update, string $format, mixed $referenceDate, int $delta, string $unit): void
    {
        $expr = $update->expression();
        $platform = $this->connection->getDatabasePlatform();

        $randomInterval = $expr->intervalUnit(
            // This additional cast is necessary for SQLite only because it
            // will mix up int addition and string concatenation, causing
            // the interval string to be malformed. For all other vendors,
            // it's a no-op.
            $expr->cast(
                $this->getRandomIntExpression($delta, 0 - $delta),
                'text',
            ),
            $unit,
        );

        $value = $expr->dateAdd($referenceDate, $randomInterval);

        // The result is cast to the target type for every vendor but SQLite.
        if (!$platform instanceof SqlitePlatform) {
            if ('date' === $format) {
                $type = 'date';
            } else {
                // Type name differs between MySQL-like platforms and others.
                $type = $platform instanceof AbstractMySQLPlatform ? 'datetime' : 'timestamp';
            }

            $value = $expr->cast($value, $type);
        }

        $update->set($this->columnName, $this->getSetIfNotNullExpression($value));
    }

    /**
     * Parses the 'min' or 'max' option value.
     *
     * @param string $value
     *   Any string accepted by the \DateTimeImmutable constructor.
     * @param string $option
     *   Option name, used in the error message only.
     *
     * @throws \InvalidArgumentException
     *   When the value cannot be parsed as a date.
     */
    private function parseDate(string $value, string $option): \DateTimeImmutable
    {
        try {
            return new \DateTimeImmutable($value);
        } catch (\Exception $e) {
            throw new \InvalidArgumentException(
                \sprintf(
                    "'%s' value is invalid, expected a valid date time format, got '%s'",
                    $option,
                    $value,
                ),
                previous: $e,
            );
        }
    }

    /**
     * Parses the 'delta' option value, either as an ISO 8601 duration
     * (e.g. "P1DT1M") or as a relative date string (e.g. "1 month 3 day").
     *
     * @throws \InvalidArgumentException
     *   When the value matches none of these formats.
     */
    private function parseInterval(string $value): \DateInterval
    {
        try {
            return new \DateInterval($value);
        } catch (\Exception) {
            // Not an ISO 8601 duration, try with a relative date string.
        }

        // Depending on the PHP version, an invalid string either raises an
        // exception or a warning along with a false return: handle both.
        try {
            $interval = \DateInterval::createFromDateString($value);
        } catch (\Exception $e) {
            throw new \InvalidArgumentException("'delta' option interval string format is invalid.", previous: $e);
        }

        if (!$interval) {
            throw new \InvalidArgumentException("'delta' option interval string format is invalid.");
        }

        return $interval;
    }
}
Loading

0 comments on commit 48691a5

Please sign in to comment.