Hi everyone, I'm py小陳! As a programmer who works with data every day, handling CSV files is part of the daily routine. Today I'm sharing a practical CSV processing toolkit so you can skip the tedious Excel busywork!
I. Environment Setup
# pip install pandas numpy openpyxl tqdm
import pandas as pd
import numpy as np
from tqdm import tqdm
import csv
import os
from datetime import datetime
II. Basic Processing Features
1. CSV Reader/Writer
class CSVHandler:
    def __init__(self, encoding='utf-8'):
        self.encoding = encoding
        self.data = None

    def read_csv(self, file_path, chunk_size=None):
        '''
        Read a CSV file intelligently.
        Supports chunked reading for large files.
        '''
        try:
            if chunk_size:
                return pd.read_csv(file_path,
                                   encoding=self.encoding,
                                   chunksize=chunk_size)
            self.data = pd.read_csv(file_path, encoding=self.encoding)
            return self.data
        except UnicodeDecodeError:
            # Fall back to other common encodings
            for enc in ['gbk', 'gb2312', 'utf-8-sig']:
                try:
                    self.data = pd.read_csv(file_path, encoding=enc)
                    self.encoding = enc
                    return self.data
                except (UnicodeDecodeError, UnicodeError):
                    continue
            raise ValueError('Unable to detect the file encoding')

    def save_csv(self, data, output_path, index=False):
        '''Save a DataFrame to CSV.'''
        data.to_csv(output_path, encoding=self.encoding, index=index)
        print(f'File saved to: {output_path}')
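A quick usage sketch (the file names data.csv and data_copy.csv are just placeholders):

handler = CSVHandler()
df = handler.read_csv('data.csv')        # falls back to gbk/gb2312/utf-8-sig on decode errors
handler.save_csv(df, 'data_copy.csv')    # written back with whichever encoding was detected

# For very large files, pass chunk_size to get an iterator of DataFrames instead
for chunk in handler.read_csv('data.csv', chunk_size=50000):
    print(len(chunk))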
2. Data Cleaning Utilities
class DataCleaner:
    @staticmethod
    def remove_duplicates(df, subset=None):
        '''Remove duplicate rows.'''
        original_len = len(df)
        df = df.drop_duplicates(subset=subset)
        removed = original_len - len(df)
        print(f'Removed {removed} duplicate rows')
        return df

    @staticmethod
    def fill_missing_values(df, method='mean'):
        '''Handle missing values.'''
        if method == 'mean':
            # numeric_only avoids errors when the frame also has string columns
            return df.fillna(df.mean(numeric_only=True))
        elif method == 'mode':
            return df.fillna(df.mode().iloc[0])
        elif method == 'ffill':
            # fillna(method='ffill') is deprecated in recent pandas; use ffill()
            return df.ffill()
        return df.fillna(0)

    @staticmethod
    def remove_invalid_rows(df, conditions):
        '''Keep only the rows that satisfy every condition.'''
        original_len = len(df)
        for column, condition in conditions.items():
            if callable(condition):
                df = df[condition(df[column])]
            else:
                df = df[df[column] == condition]
        removed = original_len - len(df)
        print(f'Removed {removed} invalid rows')
        return df
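To make the conditions parameter concrete, here is a small sketch; the column names id, age, and status are made up for illustration:

cleaner = DataCleaner()
df = cleaner.remove_duplicates(df, subset=['id'])
df = cleaner.fill_missing_values(df, method='mean')
# Keep only rows where age is positive and status equals 'active'
df = cleaner.remove_invalid_rows(df, {
    'age': lambda s: s > 0,    # callable: receives the whole column, returns a boolean mask
    'status': 'active',        # plain value: rows are kept only where the column equals it
})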
III. Advanced Data Processing
1. Data Transformation Utilities
class DataTransformer:
    @staticmethod
    def convert_types(df, type_dict):
        '''
        Convert data types in batch.
        type_dict: {'column_name': 'target_type'}
        '''
        for column, dtype in type_dict.items():
            try:
                df[column] = df[column].astype(dtype)
            except Exception as e:
                print(f'Failed to convert column {column}: {str(e)}')
        return df

    @staticmethod
    def apply_functions(df, func_dict):
        '''
        Apply functions to columns in batch.
        func_dict: {'column_name': function}
        '''
        for column, func in func_dict.items():
            df[column] = df[column].apply(func)
        return df

    @staticmethod
    def split_column(df, column, new_columns, separator):
        '''Split one column into several new columns.'''
        split_data = df[column].str.split(separator, expand=True)
        split_data.columns = new_columns
        return pd.concat([df, split_data], axis=1)
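A short usage sketch; the column names below (price, quantity, name, order_date) are placeholders:

transformer = DataTransformer()
df = transformer.convert_types(df, {'price': 'float64', 'quantity': 'int64'})
df = transformer.apply_functions(df, {'name': str.strip})
# new_columns must match the number of parts the split produces
df = transformer.split_column(df, 'order_date', ['year', 'month', 'day'], '-')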
2. Data Analysis Utilities
class DataAnalyzer:
    @staticmethod
    def generate_summary(df):
        '''Generate a quick overview of the data.'''
        summary = {
            'total_rows': len(df),
            'column_count': len(df.columns),
            'dtypes': df.dtypes.to_dict(),
            'missing_values': df.isnull().sum().to_dict(),
            'numeric_summary': df.describe().to_dict()
        }
        return summary

    @staticmethod
    def column_statistics(df, column):
        '''Detailed statistics for a single column.'''
        if pd.api.types.is_numeric_dtype(df[column]):
            return {
                'mean': df[column].mean(),
                'median': df[column].median(),
                'std': df[column].std(),
                'max': df[column].max(),
                'min': df[column].min()
            }
        else:
            return {
                'unique_values': df[column].nunique(),
                'most_frequent': df[column].mode().iloc[0],
                'value_counts': df[column].value_counts().head()
            }
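For example (price is a placeholder column name):

analyzer = DataAnalyzer()
summary = analyzer.generate_summary(df)
print(summary['total_rows'], summary['missing_values'])
print(analyzer.column_statistics(df, 'price'))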
IV. Large File Handler
class LargeFileHandler:
    def __init__(self, chunk_size=10000):
        self.chunk_size = chunk_size

    def process_large_file(self, input_path, output_path, process_func):
        '''
        Process a large file chunk by chunk.
        '''
        print('Processing large file...')
        chunks = pd.read_csv(input_path, chunksize=self.chunk_size)
        # Process the first chunk separately so the output file gets a header row
        first_chunk = next(chunks)
        processed_first = process_func(first_chunk)
        processed_first.to_csv(output_path, mode='w', index=False)
        # Process the remaining chunks and append them without headers
        for chunk in tqdm(chunks):
            processed_chunk = process_func(chunk)
            processed_chunk.to_csv(output_path, mode='a',
                                   header=False, index=False)
        print(f'Done. Output saved to: {output_path}')
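A minimal usage sketch; big.csv and the per-chunk cleanup below are only placeholders:

def clean_chunk(chunk):
    # Any function that takes a DataFrame and returns a DataFrame will work here
    return chunk.drop_duplicates().fillna(0)

handler = LargeFileHandler(chunk_size=50000)
handler.process_large_file('big.csv', 'big_cleaned.csv', clean_chunk)

Note that per-chunk operations such as drop_duplicates only see one chunk at a time, so duplicates that span different chunks will survive.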
V. Complete Application Example
class CSVProcessor:
    def __init__(self):
        self.handler = CSVHandler()
        self.cleaner = DataCleaner()
        self.transformer = DataTransformer()
        self.analyzer = DataAnalyzer()

    def process_file(self, input_path, output_path, config):
        '''
        One-stop CSV processing pipeline.
        '''
        print('Processing data...')
        # 1. Read the file
        df = self.handler.read_csv(input_path)
        # 2. Clean the data
        if config.get('remove_duplicates'):
            df = self.cleaner.remove_duplicates(df)
        if config.get('fill_missing'):
            df = self.cleaner.fill_missing_values(df,
                                                  config.get('fill_method', 'mean'))
        # 3. Convert types
        if config.get('type_conversions'):
            df = self.transformer.convert_types(df,
                                                config['type_conversions'])
        # 4. Custom processing
        if config.get('custom_functions'):
            df = self.transformer.apply_functions(df,
                                                  config['custom_functions'])
        # 5. Save the result
        self.handler.save_csv(df, output_path)
        # 6. Generate a report
        return self.analyzer.generate_summary(df)
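Putting it together, here is one possible config for process_file; the file and column names are illustrative:

processor = CSVProcessor()
report = processor.process_file('sales.csv', 'sales_clean.csv', config={
    'remove_duplicates': True,
    'fill_missing': True,
    'fill_method': 'mean',
    'type_conversions': {'quantity': 'int64'},
    'custom_functions': {'region': str.upper},
})
print(report['missing_values'])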
VI. Quick Usage Example
# Quick CSV processing in a handful of lines
def quick_process_csv(input_file, output_file):
    df = pd.read_csv(input_file)
    df = df.drop_duplicates()
    df = df.fillna(0)
    # Make sure numeric columns keep a proper numeric dtype
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
    df.to_csv(output_file, index=False)
    print(f'Done: {input_file} -> {output_file}')

# Usage example
quick_process_csv('input.csv', 'output.csv')
Practical Tips
- Use appropriate data types: downcasting numeric columns and converting repetitive text columns to the category dtype can noticeably reduce memory usage on large files (see the sketch below).
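As a sketch of that tip, the helper below downcasts numbers and switches low-cardinality text columns to category; the 50% uniqueness threshold is an arbitrary rule of thumb, not a fixed recommendation:

def optimize_dtypes(df):
    '''Downcast numeric columns and convert low-cardinality text columns to category.'''
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].nunique() < len(df) * 0.5:
            df[col] = df[col].astype('category')
    return df

print(df.memory_usage(deep=True).sum())   # bytes before
df = optimize_dtypes(df)
print(df.memory_usage(deep=True).sum())   # bytes after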
Conclusion
With this CSV toolkit, working through piles of tabular data is no longer a headache. If you want to keep learning, remember to follow!
I'm py小陳. If today's content helped you, don't forget to leave a like! Feel free to ask questions in the comments so we can improve together. See you next time!