Spaces:
Paused
Paused
| emilia_root=$1 # /data/scratch/pyp/datasets/emilia/downloads | |
| for file in ${emilia_root}/* ; do | |
| # Check if gzip compressed archive | |
| if file "$file" | grep -q 'gzip compressed data'; then | |
| # Extract string of form 'was "EN_B00100.tar"'' from the output of the file command to keep EN_B00100 | |
| filename=$(file "$file" | grep -oP '(?<=was ")[^"]*' | sed 's/\.tar$//') | |
| # Get the file size | |
| size=$(du -sh "$file" | cut -f1) | |
| original_filename=$(basename "$file") | |
| # Get URL string from corresponding JSON file with same basename | |
| json_file=$file.json | |
| if [ -f "$json_file" ]; then | |
| # url=$(jq -r '.url' "$json_file") # jq is not installed on the server | |
| url=$(python3 -c "import sys, json; print(json.load(open('$json_file'))['url'])") | |
| else | |
| url="N/A" | |
| fi | |
| # Compute SHA256 hash of the file | |
| hash=$(python sha256hash.py "$file") | |
| echo $original_filename | |
| # Write filename, size, hash, original filename, URL to output file | |
| echo "$filename, $size, $hash, $original_filename, $url" >> file_log.txt | |
| fi | |
| done | |
| # Sort the output file by filename | |
| sort -o file_log.txt file_log.txt |