Skip to content

Commit

Permalink
Swap sqlite cursor with dictionary and set data structures (ambv#24)
Browse files Browse the repository at this point in the history
1. Use 2 new data structures:
- paths (set): contains all the files present in the actual filesystem
- hashes (dictionary): replaces the per-file sqlite query with dict[hash] = set(db paths)

2. Minimal unit tests created with bats (a Bash test framework)

See ambv#23 for details.
  • Loading branch information
liloman authored and ambv committed Mar 3, 2017
1 parent 6b4a1fd commit a8e5262
Show file tree
Hide file tree
Showing 3 changed files with 310 additions and 24 deletions.
53 changes: 29 additions & 24 deletions src/bitrot.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
`follow_links` is False (the default). All entries present in `expected`
must be files (can't be directories or symlinks).
"""
paths = []
paths = set()
total_size = 0
for path, _, files in os.walk(directory):
for f in files:
Expand All @@ -129,9 +129,8 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
else:
if not stat.S_ISREG(st.st_mode) or p in ignored:
continue
paths.append(p)
paths.add(p)
total_size += st.st_size
paths.sort()
return paths, total_size


Expand Down Expand Up @@ -180,12 +179,13 @@ def run(self):
errors = []
current_size = 0
missing_paths = self.select_all_paths(cur)
hashes = self.select_all_hashes(cur)
paths, total_size = list_existing_paths(
b'.', expected=missing_paths, ignored={bitrot_db, bitrot_sha512},
follow_links=self.follow_links,
)

for p in paths:
for p in sorted(paths):
p_uni = p.decode(FSENCODING)
try:
st = os.stat(p)
Expand Down Expand Up @@ -227,7 +227,7 @@ def run(self):
row = cur.fetchone()
if not row:
stored_path = self.handle_unknown_path(
cur, p_uni, new_mtime, new_sha1,
cur, p_uni, new_mtime, new_sha1, paths, hashes
)
self.maybe_commit(conn)

Expand Down Expand Up @@ -291,6 +291,16 @@ def select_all_paths(self, cur):
row = cur.fetchone()
return result

def select_all_hashes(self, cur):
    """Group every path stored in the `bitrot` table by its hash.

    `cur` is the database cursor. Returns a dictionary whose keys are the
    stored hashes and whose values are the sets of paths recorded under
    each hash.
    """
    cur.execute('SELECT hash, path FROM bitrot')
    paths_by_hash = {}
    # fetchone() yields None once the result set is exhausted.
    for stored_hash, stored_path in iter(cur.fetchone, None):
        if stored_hash not in paths_by_hash:
            paths_by_hash[stored_hash] = set()
        paths_by_hash[stored_hash].add(stored_path)
    return paths_by_hash

def report_progress(self, current_size, total_size):
size_fmt = '\r{:>6.1%}'.format(current_size/(total_size or 1))
if size_fmt == self._last_reported_size:
Expand Down Expand Up @@ -345,37 +355,32 @@ def report_done(
if self.test and self.verbosity:
print('warning: database file not updated on disk (test mode).')

def handle_unknown_path(self, cur, new_path, new_mtime, new_sha1, paths, hashes):
    """Either add a new entry to the database or update the existing entry
    on rename.

    `cur` is the database cursor. `new_path` is a path that has no row in
    the database yet; `new_mtime` and `new_sha1` are its current values.
    `paths` is the set of paths found on disk during this run. `hashes`
    maps each hash stored in the database to the set of paths recorded
    under it; it is updated in place so that it keeps mirroring the
    statements issued here.

    Returns `new_path` if the entry was indeed new or the `stored_path` (e.g.
    outdated path) if there was a rename.
    """
    def on_disk(path):
        # The caller builds `paths` by walking b'.', so its entries are
        # bytes, while the database hands back text. Check both spellings,
        # otherwise every stored path looks missing on Python 3.
        if path in paths:
            return True
        try:
            return path.encode(FSENCODING) in paths
        except (AttributeError, UnicodeError):
            # Already bytes, or not representable in the FS encoding.
            return False

    same_hash = hashes.setdefault(new_sha1, set())
    for stored_path in same_hash:
        if on_disk(stored_path):
            # file still exists, move on
            continue

        # A file with this hash used to exist but is gone: treat
        # `new_path` as its new name and update the path in the database.
        cur.execute(
            'UPDATE bitrot SET mtime=?, path=?, timestamp=? WHERE path=?',
            (new_mtime, new_path, ts(), stored_path),
        )
        # Without this, a second new file with the same hash would be
        # matched against the already-renamed row, the UPDATE would touch
        # nothing and that file would never be recorded.
        same_hash.discard(stored_path)
        same_hash.add(new_path)
        return stored_path

    # no rename, just a new file with the same hash
    cur.execute(
        'INSERT INTO bitrot VALUES (?, ?, ?, ?)',
        (new_path, new_mtime, new_sha1, ts()),
    )
    same_hash.add(new_path)
    return new_path

def get_path(directory=b'.', ext=b'db'):
"""Compose the path to the selected bitrot file."""
Expand Down
220 changes: 220 additions & 0 deletions tests/test-bitrot.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
#!/usr/bin/env bats

load test_helper


# Path of the bitrot executable under test. Change the default below, or
# export BITROT_CMD to point at another copy without editing this file.
cmd=~/Clones/bitrot/src/bitrot.py
cmd=${BITROT_CMD:-$cmd}

# cmd=bitrot

# Scratch directory shared by all the tests in this file; they run in order
# and each one builds on the state left behind by the previous one.
test_dir=/tmp/bitrot_dir-$USER
mkdir -p "$test_dir"
# Bail out rather than run the tests (and the final cleanup) somewhere else.
cd "$test_dir" || exit

###########
# BASIC #
###########

@test "bitrot detects new files in a tree dir" {
    # Seed the tree with one empty file and one file that has content.
    local dir=notemptydirs/dir2
    mkdir -p "$dir/"
    touch "$dir/new-file-a.txt" "$dir/new-file-b.txt"
    printf '%s\n' "$RANDOM" >> "$dir/new-file-b.txt"

    run $cmd -v
    # check_fail "${lines[@]}"
    (( $status == 0 ))

    # Expected output keyed by line number; line 0 (the "Finished."
    # summary) is deliberately not checked.
    local -a expected=(
        [1]="2 entries in the database. 2 entries new:"
        " ./notemptydirs/dir2/new-file-a.txt"
        " ./notemptydirs/dir2/new-file-b.txt"
        "Updating bitrot.sha512... done."
    )
    local i
    for i in "${!expected[@]}"; do
        [[ ${lines[i]} = "${expected[i]}" ]]
    done
}


@test "bitrot detects modified files in a tree dir" {
    # NOTE(review): the pause presumably guarantees a newer mtime than the
    # one recorded by the previous run -- confirm.
    sleep 1
    printf '%s\n' "$RANDOM" >> notemptydirs/dir2/new-file-a.txt

    run $cmd -v
    # check_fail "${lines[@]}"
    (( $status == 0 ))

    # Expected output keyed by line number; line 1 (the "Finished."
    # summary) is deliberately not checked.
    local -a expected=(
        [0]="Checking bitrot.db integrity... ok."
        [2]="2 entries in the database. 1 entries updated:"
        " ./notemptydirs/dir2/new-file-a.txt"
        "Updating bitrot.sha512... done."
    )
    local i
    for i in "${!expected[@]}"; do
        [[ ${lines[i]} = "${expected[i]}" ]]
    done
}

@test "bitrot detects renamed files in a tree dir" {
    sleep 1
    local dir=notemptydirs/dir2
    mv "$dir/new-file-a.txt" "$dir/new-file-a.txt2"

    run $cmd -v
    # check_fail "${lines[@]}"
    (( $status == 0 ))

    # Expected output keyed by line number; line 1 (the "Finished."
    # summary) is deliberately not checked.
    local -a expected=(
        [0]="Checking bitrot.db integrity... ok."
        [2]="2 entries in the database. 1 entries renamed:"
        " from ./notemptydirs/dir2/new-file-a.txt to ./notemptydirs/dir2/new-file-a.txt2"
        "Updating bitrot.sha512... done."
    )
    local i
    for i in "${!expected[@]}"; do
        [[ ${lines[i]} = "${expected[i]}" ]]
    done
}

@test "bitrot detects delete files in a tree dir" {
    sleep 1
    # Remove the file renamed by the previous test.
    rm notemptydirs/dir2/new-file-a.txt2

    run $cmd -v
    # check_fail "${lines[@]}"
    (( $status == 0 ))

    # Expected output keyed by line number; line 1 (the "Finished."
    # summary) is deliberately not checked.
    local -a expected=(
        [0]="Checking bitrot.db integrity... ok."
        [2]="1 entries in the database. 1 entries missing:"
        " ./notemptydirs/dir2/new-file-a.txt2"
        "Updating bitrot.sha512... done."
    )
    local i
    for i in "${!expected[@]}"; do
        [[ ${lines[i]} = "${expected[i]}" ]]
    done
}


@test "bitrot detects new files and modified in a tree dir " {
    sleep 1
    # Seven new empty files at the top level...
    local letter
    for letter in a b c d e f g; do
        touch "more-files-$letter.txt"
    done
    # ...plus one modification of an already tracked file.
    printf '%s\n' "$RANDOM" >> notemptydirs/dir2/new-file-b.txt

    run $cmd -v
    #check_fail "${lines[@]}"
    (( $status == 0 ))

    # Expected output from line 2 onwards; lines 0 and 1 are not checked.
    local -a expected=(
        [2]="8 entries in the database. 7 entries new:"
        " ./more-files-a.txt"
        " ./more-files-b.txt"
        " ./more-files-c.txt"
        " ./more-files-d.txt"
        " ./more-files-e.txt"
        " ./more-files-f.txt"
        " ./more-files-g.txt"
        "1 entries updated:"
        " ./notemptydirs/dir2/new-file-b.txt"
        "Updating bitrot.sha512... done."
    )
    local i
    for i in "${!expected[@]}"; do
        [[ ${lines[i]} = "${expected[i]}" ]]
    done
}

@test "bitrot detects new files, modified, deleted and moved in a tree dir " {
    sleep 1
    # New files, each with its own content.
    local letter
    for letter in a b c d e f g; do
        printf '%s\n' "$RANDOM" >> "notemptydirs/pl-more-files-$letter.txt"
    done
    # One modification, one rename and one deletion of tracked files.
    printf '%s\n' "$RANDOM" >> notemptydirs/dir2/new-file-b.txt
    mv more-files-a.txt more-files-a.txt2
    rm more-files-g.txt

    run $cmd -v
    (( $status == 0 ))

    # Expected output from line 2 onwards; lines 0 and 1 are not checked.
    local -a expected=(
        [2]="14 entries in the database. 7 entries new:"
        " ./notemptydirs/pl-more-files-a.txt"
        " ./notemptydirs/pl-more-files-b.txt"
        " ./notemptydirs/pl-more-files-c.txt"
        " ./notemptydirs/pl-more-files-d.txt"
        " ./notemptydirs/pl-more-files-e.txt"
        " ./notemptydirs/pl-more-files-f.txt"
        " ./notemptydirs/pl-more-files-g.txt"
        "1 entries updated:"
        " ./notemptydirs/dir2/new-file-b.txt"
        "1 entries renamed:"
        " from ./more-files-a.txt to ./more-files-a.txt2"
        "1 entries missing:"
        " ./more-files-g.txt"
        "Updating bitrot.sha512... done."
    )
    local i
    for i in "${!expected[@]}"; do
        [[ ${lines[i]} = "${expected[i]}" ]]
    done
}


@test "bitrot detects new files, modified, deleted and moved in a tree dir 2" {
    sleep 1
    local dir=notemptydirs letter
    # New files, each with its own content.
    for letter in a b c d e f g; do
        printf '%s\n' "$RANDOM" >> "$dir/pl2-more-files-$letter.txt"
    done
    # One modification of a tracked file.
    printf '%s\n' "$RANDOM" >> "$dir/pl-more-files-a.txt"

    # One rename, and two copies whose originals stay in place: the copies
    # are expected to be reported as new, not as renames.
    mv "$dir/pl-more-files-b.txt" "$dir/pl-more-files-b.txt2"
    cp "$dir/pl-more-files-g.txt" "$dir/pl2-more-files-g.txt2"
    cp "$dir/pl-more-files-d.txt" "$dir/pl2-more-files-d.txt2"

    # Two deletions.
    rm more-files-f.txt "$dir/pl-more-files-c.txt"

    run $cmd -v
    # check_fail "${lines[@]}"
    (( $status == 0 ))

    # Expected output from line 2 onwards; lines 0 and 1 are not checked.
    local -a expected=(
        [2]="21 entries in the database. 9 entries new:"
        " ./notemptydirs/pl2-more-files-a.txt"
        " ./notemptydirs/pl2-more-files-b.txt"
        " ./notemptydirs/pl2-more-files-c.txt"
        " ./notemptydirs/pl2-more-files-d.txt"
        " ./notemptydirs/pl2-more-files-d.txt2"
        " ./notemptydirs/pl2-more-files-e.txt"
        " ./notemptydirs/pl2-more-files-f.txt"
        " ./notemptydirs/pl2-more-files-g.txt"
        " ./notemptydirs/pl2-more-files-g.txt2"
        "1 entries updated:"
        " ./notemptydirs/pl-more-files-a.txt"
        "1 entries renamed:"
        " from ./notemptydirs/pl-more-files-b.txt to ./notemptydirs/pl-more-files-b.txt2"
        "2 entries missing:"
        " ./more-files-f.txt"
        " ./notemptydirs/pl-more-files-c.txt"
        "Updating bitrot.sha512... done."
    )
    local i
    for i in "${!expected[@]}"; do
        [[ ${lines[i]} = "${expected[i]}" ]]
    done
}


@test "bitrot can operate with 3278 files easily in a dir" {
    sleep 1
    mkdir -p alotfiles/here
    cd alotfiles/here
    # Create a 320 KiB (327680-byte) file of random data. head -c reads it
    # in large chunks; the previous dd bs=1 issued one read and one write
    # per byte.
    head -c 327680 /dev/urandom > masterfile
    # Split it into 100-byte pieces: 3277 files + masterfile = 3278.
    split -b 100 -a 10 masterfile
    cd "$test_dir"
    run $cmd

    (( $status == 0 ))
    [[ ${lines[2]} = "3299 entries in the database, 3278 new, 0 updated, 0 renamed, 0 missing." ]]
}

@test "bitrot can operate with 3278 files easily in a dir 2 " {
    sleep 1
    # Moving the directory renames every file created by the previous test.
    mv alotfiles/here alotfiles/here-moved

    run $cmd
    #check_fail "${lines[@]}"
    (( $status == 0 ))

    local summary="3299 entries in the database, 0 new, 0 updated, 3278 renamed, 0 missing."
    [[ ${lines[2]} = "$summary" ]]
}

@test "bitrot can detetect a bitrot in a dir ! " {
    sleep 1
    # Create ./bitrot-file, record it, then alter its content while
    # restoring its timestamp (see generate_bitrot in test_helper).
    generate_bitrot ./bitrot-file 10 2 $cmd

    run $cmd -q
    #check_fail "${lines[@]}"

    # A detected mismatch makes the command exit with status 1.
    (( $status == 1 ))
    local mismatch="error: SHA1 mismatch for ./bitrot-file: expected"
    local total="error: There were 1 errors found."
    [[ ${lines[0]} = *"$mismatch"* ]]
    [[ ${lines[1]} = "$total" ]]
}


@test "Clean everything" {
    # Make everything writable first so the removal below cannot be refused.
    # The ./ prefix keeps names starting with '-' from being read as options.
    run chmod -f a+w ./*
    # NOTE(review): BITROT_BACKUPS is not set anywhere in this file; it is
    # left unquoted so that an unset value expands to nothing -- confirm
    # where it is expected to come from.
    \rm -rf ./* "$test_dir" $BITROT_BACKUPS
}

61 changes: 61 additions & 0 deletions tests/test_helper.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env bash

# Locale used by everything that runs after this helper is loaded.
# NOTE(review): presumably pinned to C so that the messages and ordering
# asserted in the tests do not depend on the user's locale -- confirm.
# The UTF-8 alternatives below are kept disabled.
# LC_ALL=en_US.UTF-8
# LANGUAGE=en_US.UTF-8
LANG=C

# Debug aid: write every argument to /tmp/bats.log, one per line, replacing
# whatever the file held before. Typical use: check_fail "${lines[@]}".
check_fail() {
    local temp=/tmp/bats.log
    # Start from an empty file.
    : > "$temp"
    # printf, unlike echo, does not swallow lines such as "-n" or "-e".
    # The guard keeps a call without arguments from writing an empty line.
    if (( $# )); then
        printf '%s\n' "$@" >> "$temp"
    fi
    # cat /tmp/.bitrot.log >> $temp
}


# Simulate bitrot on a single file.
#   $1  file to create and then corrupt
#   $2  size factor: the file is made of $2 * 100 blocks of 1 KiB
#   $3  amount of random data to write, as a percentage of the block count
#       (default 5)
#   $4  bitrot command to run before and after the corruption
# Relies on the `run` helper provided by bats.
generate_bitrot() {
    local dest=$1 temp=/tmp/temp-base
    local -i count=$(($2*100)) percent=${3:-5}
    local cmd=$4
    mkdir -p "${dest%/*}"
    # $temp only serves as the reference timestamp for touch -r.
    touch "$dest" "$temp"

    dd if=/dev/zero of="$dest" bs=1k count=$count &>/dev/null
    # Let's make sure they share the same timestamp. This has to happen
    # after dd, which updates the modification time when it writes.
    # NOTE(review): assumes bitrot only reports an error when the recorded
    # mtime is unchanged -- confirm against bitrot.py.
    touch -r "$temp" "$dest"
    run $cmd
    # Modify it and change the modification date back to the base file's,
    # simulating real bitrot.
    # NOTE(review): seek=1k with bs=1k starts writing 1024 KiB into the
    # file, which is past its end whenever count < 1024, so the file
    # grows -- confirm this is intended.
    dd seek=1k if=/dev/urandom of="$dest" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null
    touch -r "$temp" "$dest"
    # The original removed "$tmp", which is never set, leaving the
    # reference file behind.
    \rm -f "$temp"
    run $cmd
}

# Simulate bitrot on two files living in two directories.
#   $1  first file to create and then corrupt
#   $2  second file to create and then corrupt
#   $3  size factor: each file is made of $3 * 100 blocks of 1 KiB
#   $4  amount of random data to write, as a percentage of the block count
#       (default 5)
# The status of both runs is written to /tmp/status. Relies on the `run`
# helper provided by bats.
generate_bitrots() {
    local dest=$1 dest2=$2 temp=/tmp/temp-base
    local -i count=$(($3*100)) percent=${4:-5}
    mkdir -p "${dest%/*}"
    mkdir -p "${dest2%/*}"
    local dir_base=${dest%/*}
    local dir_base2=${dest2%/*}
    # $temp only serves as the reference timestamp for touch -r.
    touch "$dest2" "$dest" "$temp"

    dd if=/dev/zero of="$dest" bs=1k count=$count &>/dev/null
    dd if=/dev/zero of="$dest2" bs=1k count=$count &>/dev/null
    # Let's make sure they share the same timestamp. This has to happen
    # after dd, which updates the modification time when it writes.
    touch -r "$temp" "$dest"
    touch -r "$temp" "$dest2"
    # NOTE(review): $r is not set anywhere in this helper, so this runs
    # "$dir_base" itself as the command -- confirm which command was meant.
    run $r "$dir_base" "$dir_base2"
    # Modify them and change the modification date back to the base file's,
    # simulating bitrot.
    dd seek=1k if=/dev/urandom of="$dest" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null
    dd seek=1k if=/dev/urandom of="$dest2" bs=1k count=$((count*percent/100)) conv=notrunc &>/dev/null
    touch -r "$temp" "$dest"
    touch -r "$temp" "$dest2"
    # The original removed "$tmp", which is never set, leaving the
    # reference file behind.
    \rm -f "$temp"
    echo $status > /tmp/status
    run $r "$dir_base" "$dir_base2"
    echo $status >> /tmp/status
}

0 comments on commit a8e5262

Please sign in to comment.