Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- from mpl_toolkits.mplot3d import Axes3D
# Step 1: Load Data Efficiently
def load_data(filepath):
    """Read a tab-separated genotype file into a DataFrame.

    The file is parsed without a header row; columns are named ``rsid``,
    ``chromosome``, ``position`` and ``genotype``. Anything following a
    ``#`` is treated as a comment and ignored by the parser.

    Args:
        filepath: Path (or any source accepted by ``pandas.read_csv``) of
            the file to read.

    Returns:
        The parsed DataFrame. Its first rows are also printed to stdout.
    """
    # One mapping drives both the column names and their dtypes; explicit
    # dtypes spare pandas from inferring them while reading.
    column_types = {
        'rsid': str,
        'chromosome': 'category',
        'position': int,
        'genotype': str,
    }
    frame = pd.read_csv(
        filepath,
        sep='\t',
        names=list(column_types),
        comment='#',
        dtype=column_types,
        low_memory=False,
    )
    print(frame.head())
    return frame
# Step 2: Preprocess and Encode Data
def preprocess_data(data):
    """Clean genotype calls and encode chromosomes as integers.

    The frame is modified in place and also returned:

    * rows whose ``genotype`` is ``'--'`` (or already missing) are dropped;
    * ``genotype_value`` is added, holding the length of each genotype
      string;
    * ``chromosome`` is converted to numbers. Numeric labels keep their own
      value; every distinct non-numeric label receives its own integer
      code, numbered upwards from the largest numeric label in sorted
      label order.

    Args:
        data: DataFrame with at least ``genotype`` and ``chromosome``
            columns.

    Returns:
        The same DataFrame object, after cleaning and encoding.
    """
    # Assign the result back rather than calling replace(inplace=True) on
    # the column selection: the chained in-place form operates on a
    # temporary and does not reliably update the frame.
    data['genotype'] = data['genotype'].replace('--', np.nan)
    data.dropna(subset=['genotype'], inplace=True)
    data['genotype_value'] = data['genotype'].map(len)

    # Work on plain string labels so the conversion does not depend on the
    # incoming dtype of the column (it may be categorical).
    labels = data['chromosome'].astype(str)
    numeric = pd.to_numeric(labels, errors='coerce')
    unresolved = numeric.isna()
    if unresolved.any():
        # Give each non-numeric label a distinct code instead of collapsing
        # them all into one value, and keep numeric labels unchanged.
        offset = int(numeric.max()) if numeric.notna().any() else 0
        extra_labels = sorted(labels[unresolved].unique())
        codes = {
            label: offset + rank
            for rank, label in enumerate(extra_labels, start=1)
        }
        numeric = numeric.where(~unresolved, labels.map(codes)).astype(int)
    data['chromosome'] = numeric
    return data
# Step 3: Optimized Plotting Function
def plot_data(data, step=10):
    """Show a 3D scatter plot of chromosome, position and genotype value.

    Args:
        data: DataFrame with ``chromosome``, ``position`` and
            ``genotype_value`` columns.
        step: Plot every ``step``-th row. Larger values render faster,
            smaller values show more detail. Defaults to 10.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    # The colour range is taken from the full data set, so the scale does
    # not depend on which rows survive the down-sampling.
    norm = plt.Normalize(data['genotype_value'].min(), data['genotype_value'].max())

    # Down-sample once (positional slice) instead of slicing each column.
    sampled = data.iloc[::step]
    scatter = ax.scatter(
        sampled['chromosome'],
        sampled['position'],
        sampled['genotype_value'],
        c=sampled['genotype_value'],
        cmap='Reds',
        norm=norm,
        marker='o',
        alpha=0.6,
        s=50,
    )

    ax.set_xlabel('Chromosome')
    ax.set_ylabel('Position')
    ax.set_zlabel('Genotype Value')
    ax.set_title('3D Visualization of DNA Genotypes')

    cbar = fig.colorbar(scatter, ax=ax, pad=0.1)
    cbar.set_label('Genotype Value')
    plt.show()
# Main function
def main(filepath='/file.txt'):
    """Run the pipeline: load the file, preprocess it, then plot it.

    Args:
        filepath: Path of the genotype file to visualise. The default is a
            placeholder; pass the actual file path.
    """
    data = load_data(filepath)
    processed_data = preprocess_data(data)
    plot_data(processed_data)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement