"""
Names: Christian Thede, Maxwell Hunter
Date: 11/15/2023
Description: This program reads the BTCP.csv file and takes only selected
columns. It then hashes each row and writes the hashed result to a new file.
It prints each hashed result to the console for debugging purposes.
This program also records benchmarking information like memory usage and
CPU usage.
"""

import hashlib
import time

import pandas as pd

try:
    import psutil
except ImportError:
    # psutil is only needed for the memory/CPU figures; the hashing itself
    # must still work when it is not installed.
    psutil = None


def hash_data(data, algorithm="md5"):
    """Return the raw digest of *data*.

    Args:
        data: Bytes to hash.
        algorithm: Any algorithm name accepted by ``hashlib.new`` (for
            example "md5", "sha1" or "sha256"). Defaults to "md5", which is
            what this program has always used.

    Returns:
        The digest as ``bytes`` (not hex encoded).

    Raises:
        ValueError: If ``hashlib`` does not know *algorithm*.
    """
    h = hashlib.new(algorithm)
    h.update(data)
    return h.digest()


def _print_resource_usage():
    """Print memory and CPU figures gathered through psutil."""
    if psutil is None:
        print("\nMemory/CPU usage: unavailable (psutil is not installed).")
        return

    # Record memory usage
    memory_info = psutil.Process().memory_info()
    print("\nMemory Usage:")
    print(f"Memory used: {memory_info.rss / (1024 * 1024):.2f} MB")

    # peak_wset is only reported by psutil on Windows; reading it
    # unconditionally raises AttributeError on other platforms.
    peak_wset = getattr(memory_info, "peak_wset", None)
    if peak_wset is None:
        print("Memory peak: not available on this platform")
    else:
        print(f"Memory peak: {peak_wset / (1024 * 1024):.2f} MB")

    # Record CPU usage (system-wide, measured since the priming call made
    # at the start of hash_and_write_rows).
    cpu_usage = psutil.cpu_percent()
    print(f"CPU usage: {cpu_usage}%")


def hash_and_write_rows(file_path, selected_columns, output_file_path,
                        algorithm="md5"):
    """Hash every row of a CSV file and write the digests to a file.

    Only *selected_columns* are read from *file_path*. The output file
    starts with one text line holding the comma-joined column names,
    followed by one record per row made of the raw digest bytes and a
    newline byte. Each digest is also printed in hex, and timing, memory
    and CPU figures are printed once all rows are processed.

    Args:
        file_path: Path of the CSV file to read.
        selected_columns: Names of the columns to load and hash.
        output_file_path: Path of the file to create or overwrite.
        algorithm: Hash algorithm name passed to ``hash_data``.

    Returns:
        None. If *file_path* cannot be opened, an error message is printed
        and the output file is not created.

    Raises:
        ValueError: If ``hashlib`` does not know *algorithm*.
    """
    # Record the start time. perf_counter is monotonic, so the reported
    # durations cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()

    # Probe the input before touching the output so that a missing input
    # never truncates an existing output file.
    try:
        with open(file_path, 'r'):
            pass
    except IOError:
        print(f"Error: File {file_path} does not appear to exist.")
        return

    # Fail on an unknown algorithm before the output file is created.
    hashlib.new(algorithm)

    # The first non-blocking cpu_percent() call only establishes a baseline
    # (its own return value is meaningless); the call made when reporting
    # then covers the whole run.
    if psutil is not None:
        psutil.cpu_percent(interval=None)

    with open(output_file_path, 'wb') as file:
        # Write selected columns to the output file
        file.write(','.join(selected_columns).encode('utf-8') + b'\n')

        # Read only selected columns, timing the read on its own
        read_start_time = time.perf_counter()
        df = pd.read_csv(file_path, usecols=selected_columns)
        read_end_time = time.perf_counter()

        # Iterate through each row, hash it, and write to the output file.
        # The per-row serialisation is kept exactly as it was so that
        # digests stay comparable with previously generated files.
        hash_start_time = time.perf_counter()
        for index, row in df.iterrows():
            hashed_result = hash_data(
                row.to_csv(index=False).encode('utf-8'), algorithm
            )
            # NOTE(review): raw digest bytes can themselves contain b'\n',
            # so records are not reliably newline-delimited. The format is
            # left unchanged here because existing readers may depend on it.
            file.write(hashed_result + b'\n')
            print(f"Hashed result for row {index}: {hashed_result.hex()}")
        hash_end_time = time.perf_counter()

    # Record the end time
    end_time = time.perf_counter()

    # Calculate and print benchmarking information
    print("\nBenchmarking Information:")
    print(f"Total time: {end_time - start_time} seconds")
    print(f"Time for reading CSV: {read_end_time - read_start_time} seconds")
    print(f"Time for hashing and writing: {hash_end_time - hash_start_time} seconds")

    _print_resource_usage()


# File paths. Raw strings keep the backslashes literal without relying on
# invalid escape sequences such as "\p"; the runtime values are unchanged.
input_file_path = r"docs\projects\project1DemoLibrary\BTCP.csv"
output_file_path = r"docs\projects\project1DemoLibrary\hashedUsers.txt"

# Specific columns to hash
selected_columns = ["id", "possibly_sensitive", "source", "text", "user_screen_name"]
print(selected_columns)

# Call the function
hash_and_write_rows(input_file_path, selected_columns, output_file_path)

print("\nHashing and writing complete.")