You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
import os
import modin.pandas as pd # Use Modin with Ray
import numpy as np
from multiprocessing import Pool, cpu_count
import time
# Setup environment to use Ray as the backend for Modin.
# NOTE(review): this assignment runs after `import modin.pandas` at the top of
# the file; confirm Modin still honours MODIN_ENGINE when it is set post-import.
os.environ["MODIN_ENGINE"] = "ray"  # Use Ray as the execution engine

# Define the path for inconsistency log
inconsistency_log_path = r'C:\dataInconsistencies.txt'

# Create the log file (with its header) only if it does not exist yet.
# Mode 'x' makes "check and create" a single atomic step, so several processes
# importing this module at the same time (e.g. multiprocessing workers that
# re-import it) cannot truncate a log that another process already started.
try:
    with open(inconsistency_log_path, 'x') as f:
        f.write('Inconsistencies in Binance Data Files\n')
        f.write('------------------------------------\n')
except FileExistsError:
    # The log already exists; keep its current contents.
    pass
# Function to check ID continuity in a specific partition
def check_id_continuity(partition_data, start_idx, end_idx, previous_last_id, partition_index, filename):
    """Report every place where consecutive IDs do not increase by exactly 1.

    Args:
        partition_data: Indexable, sliceable sequence of integer IDs. The
            caller in this file passes the whole ``id`` column as a NumPy
            ``uint64`` array.
        start_idx: Index of the first row that belongs to this partition.
        end_idx: Index one past the last row that belongs to this partition.
        previous_last_id: ID of the row immediately before ``start_idx``, or
            ``None`` when there is no preceding row to compare against.
        partition_index: Partition number; used only in the messages.
        filename: Name of the file being checked; used only in the messages.

    Returns:
        A list of newline-terminated message strings: the in-partition
        discontinuities in row order, followed by at most one cross-partition
        discontinuity. An empty range yields an empty list.
    """
    print(f"[DEBUG] Partition {partition_index} starting, processing rows {start_idx} to {end_idx}...")
    inconsistencies = []

    # An empty range has no rows to compare and no first row to look at.
    if end_idx <= start_idx:
        print(f"[DEBUG] Partition {partition_index} finished.")
        return inconsistencies

    ids = np.asarray(partition_data[start_idx:end_idx])

    # Check continuity within the partition in one vectorised comparison:
    # offset k is flagged when ids[k + 1] != ids[k] + 1.
    gap_offsets = np.flatnonzero(ids[1:] != ids[:-1] + 1)
    for offset in gap_offsets.tolist():
        # Convert to Python ints so the message shows plain integers.
        previous_id = int(ids[offset])
        current_id = int(ids[offset + 1])
        row_index = start_idx + offset + 1  # absolute index of the row after the gap
        inconsistencies.append(
            f"File: {filename}, Partition: {partition_index}, Row Index: {row_index}, ID Discontinuity: {previous_id} -> {current_id}\n")

    # Check continuity across partitions (from previous partition's last ID).
    # Compared as Python ints so the check stays exact for large IDs; mixing a
    # NumPy uint64 scalar with a Python int can promote to float64 on older
    # NumPy releases.
    if previous_last_id is not None:
        prior_id = int(previous_last_id)
        first_id = int(ids[0])
        if first_id != prior_id + 1:
            inconsistencies.append(
                f"File: {filename}, Partition: {partition_index}, Cross-partition Discontinuity: {prior_id} -> {first_id}\n")

    print(f"[DEBUG] Partition {partition_index} finished.")
    return inconsistencies
# Worker function for multiprocessing
def process_partition(args):
    """Expand one packed argument tuple into a ``check_id_continuity`` call.

    ``Pool.map`` hands each worker a single object, so the per-partition
    arguments travel as one tuple and are unpacked here.
    """
    found = check_id_continuity(*args)
    return found
# Function to process one CSV file by loading it into memory and dividing it among processes
def process_file_in_memory(filename):
    """Check one CSV file for gaps in its ``id`` column and log what is found.

    Only the ``id`` column is read (as ``uint64``). The rows are split into
    contiguous partitions that are checked in parallel worker processes, and
    every reported discontinuity is appended to ``inconsistency_log_path``.

    Failures while loading, while running the pool, or while writing the log
    are printed and end processing of this file; they are not re-raised.

    Args:
        filename: Path of the CSV file to check.

    Returns:
        None.
    """
    print(f"[DEBUG] Starting to load file: {filename}")
    # Record the start time for loading
    load_start_time = time.time()

    # Use Modin's pandas to parallelize the CSV loading
    print(f"[DEBUG] Loading CSV file using Modin with Ray: {filename}")
    try:
        data = pd.read_csv(filename, usecols=['id'], dtype={'id': np.uint64})
    except Exception as e:
        print(f"[ERROR] Failed to load file: {filename}. Exception: {e}")
        return

    # Get the number of rows
    total_rows = len(data)
    print(f"[DEBUG] File {filename} loaded successfully. Total rows: {total_rows}")

    # Convert the 'id' column to a numpy array for fast access
    id_data = data['id'].values
    load_end_time = time.time()
    print(f"[DEBUG] File loaded into memory in {load_end_time - load_start_time:.2f} seconds. Starting to process partitions...")

    # Nothing to compare in an empty file; indexing id_data below would fail.
    if total_rows == 0:
        print(f"[DEBUG] File {filename} contains no rows. Nothing to check.")
        return

    # Use at most 16 partitions, never more than the available cores, and never
    # more than the number of rows. The row cap guarantees every partition has
    # at least one row, so `start_idx - 1` below can never become -1 and wrap
    # around to the last row of the file.
    num_partitions = min(cpu_count(), 16, total_rows)
    partition_size = total_rows // num_partitions
    print(f"[DEBUG] Number of partitions: {num_partitions}, partition size: {partition_size}")

    # Prepare arguments for each process
    partitions = []
    for i in range(num_partitions):
        start_idx = i * partition_size
        # Last partition will take any remaining rows
        end_idx = total_rows if i == num_partitions - 1 else (i + 1) * partition_size
        # The first partition has no predecessor to compare against.
        previous_last_id = id_data[start_idx - 1] if i > 0 else None
        partitions.append((id_data, start_idx, end_idx, previous_last_id, i, filename))

    # Use multiprocessing to check each partition in parallel
    print("[DEBUG] Starting multiprocessing for partition processing...")
    try:
        with Pool(num_partitions) as pool:
            results = pool.map(process_partition, partitions)
    except Exception as e:
        print(f"[ERROR] Error in multiprocessing. Exception: {e}")
        return

    # Write all inconsistencies to the log file
    print(f"[DEBUG] Writing inconsistencies to log file: {inconsistency_log_path}")
    try:
        with open(inconsistency_log_path, 'a') as log_file:
            for result in results:
                # Each message already ends with a newline.
                log_file.writelines(result)
    except Exception as e:
        print(f"[ERROR] Error writing to log file. Exception: {e}")
        return

    # Print completion message and time taken
    print(f"[DEBUG] Finished processing file {filename}. Total rows: {total_rows}")
    print(f"[DEBUG] File processing completed in {time.time() - load_start_time:.2f} seconds.")
# Function to process all CSV files in a given folder
def process_all_files_in_folder(folder_path):
    """Run the ID-continuity check on every CSV file directly inside a folder.

    Files are matched by a case-insensitive ``.csv`` extension, directories
    are skipped even if their name ends in ``.csv``, and files are processed
    in sorted name order so repeated runs log in the same sequence. The scan
    is not recursive.

    Args:
        folder_path: Directory to scan for CSV files.

    Returns:
        None.
    """
    # Get all CSV files in the folder
    csv_files = []
    for entry_name in sorted(os.listdir(folder_path)):
        entry_path = os.path.join(folder_path, entry_name)
        if entry_name.lower().endswith('.csv') and os.path.isfile(entry_path):
            csv_files.append(entry_path)

    for csv_file in csv_files:
        print(f"[DEBUG] Processing file: {csv_file}")
        process_file_in_memory(csv_file)
if __name__ == "__main__":
    # Root directory; each sub-directory is expected to hold CSV files
    base_folder_path = r'G:\BINANCE DATAA'

    # Walk the entries directly under the root and hand every directory to
    # the per-folder processor; anything that is not a directory is skipped.
    for folder_name in os.listdir(base_folder_path):
        folder_path = os.path.join(base_folder_path, folder_name)
        if not os.path.isdir(folder_path):
            continue
        print(f"[DEBUG] Processing folder: {folder_path}")
        process_all_files_in_folder(folder_path)
The text was updated successfully, but these errors were encountered:
Missing trades data. Example: under UM, monthly data, file: BTCUSDT-trades-2023-02
Inconsistencies in Binance Data Files
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 401416, ID Discontinuity: 3249558288 -> 3249558290
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 423405, ID Discontinuity: 3249580278 -> 3249580280
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1213526, ID Discontinuity: 3250370400 -> 3250370402
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1322605, ID Discontinuity: 3250479480 -> 3250479482
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1334772, ID Discontinuity: 3250491648 -> 3250491650
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1334856, ID Discontinuity: 3250491733 -> 3250491735
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1334893, ID Discontinuity: 3250491771 -> 3250491773
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1335968, ID Discontinuity: 3250492847 -> 3250492849
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1359486, ID Discontinuity: 3250516366 -> 3250516368
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1359853, ID Discontinuity: 3250516734 -> 3250516736
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1707767, ID Discontinuity: 3250864649 -> 3250864651
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1756978, ID Discontinuity: 3250913861 -> 3250913863
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1756998, ID Discontinuity: 3250913882 -> 3250913885
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757149, ID Discontinuity: 3250914035 -> 3250914037
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757195, ID Discontinuity: 3250914082 -> 3250914084
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757205, ID Discontinuity: 3250914093 -> 3250914095
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757224, ID Discontinuity: 3250914113 -> 3250914115
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757282, ID Discontinuity: 3250914172 -> 3250914174
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757373, ID Discontinuity: 3250914264 -> 3250914266
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757386, ID Discontinuity: 3250914278 -> 3250914280
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757451, ID Discontinuity: 3250914344 -> 3250914346
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757470, ID Discontinuity: 3250914364 -> 3250914366
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757471, ID Discontinuity: 3250914366 -> 3250914369
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1757472, ID Discontinuity: 3250914369 -> 3250914371
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1760731, ID Discontinuity: 3250917629 -> 3250917631
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1760732, ID Discontinuity: 3250917631 -> 3250917633
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1760733, ID Discontinuity: 3250917633 -> 3250917635
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1760746, ID Discontinuity: 3250917647 -> 3250917650
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1760756, ID Discontinuity: 3250917659 -> 3250917661
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1760757, ID Discontinuity: 3250917661 -> 3250917663
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1760782, ID Discontinuity: 3250917687 -> 3250917689
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1760853, ID Discontinuity: 3250917759 -> 3250917761
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 1762330, ID Discontinuity: 3250919237 -> 3250919239
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 2073222, ID Discontinuity: 3251230130 -> 3251230132
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 2078484, ID Discontinuity: 3251235393 -> 3251235395
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 2211408, ID Discontinuity: 3251368318 -> 3251368320
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 2468476, ID Discontinuity: 3251625387 -> 3251625389
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3128867, ID Discontinuity: 3252285779 -> 3252285781
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3132930, ID Discontinuity: 3252289843 -> 3252289845
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3147985, ID Discontinuity: 3252304899 -> 3252304901
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3456060, ID Discontinuity: 3252612975 -> 3252612977
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3456288, ID Discontinuity: 3252613204 -> 3252613206
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3480743, ID Discontinuity: 3252637660 -> 3252637662
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3721400, ID Discontinuity: 3252878318 -> 3252889108
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3774808, ID Discontinuity: 3252942515 -> 3252942517
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3776221, ID Discontinuity: 3252943929 -> 3252943931
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3852381, ID Discontinuity: 3253020090 -> 3253020092
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3861061, ID Discontinuity: 3253028771 -> 3253028773
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3872550, ID Discontinuity: 3253040261 -> 3253040263
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3897112, ID Discontinuity: 3253064824 -> 3253064826
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3897684, ID Discontinuity: 3253065397 -> 3253065399
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3898113, ID Discontinuity: 3253065827 -> 3253065829
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3903229, ID Discontinuity: 3253070944 -> 3253070946
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3908085, ID Discontinuity: 3253075801 -> 3253075803
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3916291, ID Discontinuity: 3253084008 -> 3253084010
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3916336, ID Discontinuity: 3253084054 -> 3253084056
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3917027, ID Discontinuity: 3253084746 -> 3253084748
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3918026, ID Discontinuity: 3253085746 -> 3253085748
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3918663, ID Discontinuity: 3253086384 -> 3253086386
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957508, ID Discontinuity: 3253125230 -> 3253125232
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957549, ID Discontinuity: 3253125272 -> 3253125275
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957553, ID Discontinuity: 3253125278 -> 3253125280
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957555, ID Discontinuity: 3253125281 -> 3253125283
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957574, ID Discontinuity: 3253125301 -> 3253125305
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957594, ID Discontinuity: 3253125324 -> 3253125326
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957596, ID Discontinuity: 3253125327 -> 3253125329
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957600, ID Discontinuity: 3253125332 -> 3253125335
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957606, ID Discontinuity: 3253125340 -> 3253125342
File: G:\BINANCE DATAA\BTCUSDT-trades-2023-02\BTCUSDT-trades-2023-02.csv, Partition: 0, Row Index: 3957610, ID Discontinuity: 3253125345 -> 3253125347
CODE:
The text was updated successfully, but these errors were encountered: