Programming/Python 3

중복 파일 제거 프로그램

Gadget 2025. 10. 5. 02:05

# pip install send2trash

 

선택한 디렉토리 하위의 모든 파일에 대해 해시값을 계산하고, 해시값이 같은 중복 파일을 찾아 삭제할 수 있는 프로그램입니다.

시간 단축을 위해 파일 전체가 아니라 처음, 중간, 끝의 1KB씩만 추출하여 해시값을 계산합니다. 따라서 해시값이 같더라도 내용이 완전히 동일하다는 보장은 없으므로, 삭제 전에 파일을 직접 확인하는 것이 좋습니다.

import os
import hashlib
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import datetime
import send2trash
import platform
import subprocess

PARTIAL_READ_SIZE = 1024  # number of bytes sampled at each probe point


def fast_hash(filepath):
    """Return a quick, sampled MD5 fingerprint of a file.

    Instead of reading the whole file, up to ``PARTIAL_READ_SIZE`` bytes are
    read from three places: the start, the middle (``filesize // 2``) and the
    end. The file size is mixed into the digest as well, so two files of
    different length never share a fingerprint even when all three samples
    happen to be identical (for example zero-filled or padded files).

    The result is a heuristic: equal fingerprints mean "same size and same
    samples", not "byte-for-byte identical".

    Args:
        filepath: Path of the file to fingerprint.

    Returns:
        The hexadecimal MD5 digest as a string, or ``None`` when the file
        cannot be inspected or read. An empty file yields the MD5 of the
        empty byte string.
    """
    try:
        filesize = os.path.getsize(filepath)
        if filesize == 0:
            return hashlib.md5(b'').hexdigest()

        digest = hashlib.md5()
        # Length prefix: files of different sizes cannot be duplicates.
        digest.update(b'%d:' % filesize)

        # For short files the three windows overlap; that is harmless because
        # every file of that size is sampled the same way.
        offsets = (0, filesize // 2, max(0, filesize - PARTIAL_READ_SIZE))
        with open(filepath, 'rb') as f:
            for offset in offsets:
                f.seek(offset)
                digest.update(f.read(PARTIAL_READ_SIZE))
        return digest.hexdigest()
    except (OSError, ValueError):
        # Unreadable, vanished or otherwise inaccessible file: the caller
        # treats None as "skip this file".
        return None

class FileInfo:
    """Plain record describing one scanned file.

    Attributes are stored exactly as passed in:
    ``path`` (file path), ``size`` (size value shown in the UI),
    ``mtime`` (timestamp accepted by ``datetime.fromtimestamp``) and
    ``hashval`` (fingerprint string used for grouping).
    """

    def __init__(self, path, size, mtime, hashval):
        self.path, self.size = path, size
        self.mtime, self.hashval = mtime, hashval

    def formatted_mtime(self):
        """Return ``mtime`` as local time formatted ``YYYY-MM-DD HH:MM:SS``."""
        stamp = datetime.datetime.fromtimestamp(self.mtime)
        return f'{stamp:%Y-%m-%d %H:%M:%S}'

class App(tk.Tk):
    """Main window: scan directories, group files by fingerprint, trash duplicates.

    Workflow:
      1. The user adds one or more directories and an optional minimum size.
      2. ``start_run`` walks the directories, fingerprints every matching file
         with ``fast_hash`` and groups files sharing a fingerprint.
      3. Selecting a group lists its files; selected files can be moved to the
         recycle bin via ``send2trash``.

    The scan runs on the Tk thread and calls ``self.update()`` periodically so
    the window stays responsive and the run button can request a stop.
    """

    # Prefix for row ids in the group tree. The hash is carried in the row id
    # rather than read back from the cell values, because ttk converts cell
    # values that look like integers into ``int`` and would corrupt an
    # all-digit hash.
    _GROUP_ROW_PREFIX = 'grp:'

    def __init__(self):
        super().__init__()
        self.title('다중 디렉토리 해시 계산 및 파일 관리')
        self.geometry('960x700')

        self.dir_list = []  # directory paths, kept in sync with the listbox
        self.min_size_var = tk.StringVar(value='0')  # minimum file size in bytes
        self.running = False
        self.stop_requested = False

        self.create_widgets()
        self.file_info_list = []
        self.hash_groups = {}
        # Maps a row id of the file tree to the FileInfo it displays.
        self._file_rows = {}

    def create_widgets(self):
        """Build all widgets: directory box, size filter, group tree, file tree."""
        # Directory selection and control box
        frame_dir = ttk.Frame(self)
        frame_dir.pack(fill='both', padx=10, pady=5)

        label_dir = ttk.Label(frame_dir, text='디렉토리 목록:')
        label_dir.grid(row=0, column=0, sticky='w')

        self.list_dir = tk.Listbox(frame_dir, height=6, selectmode='extended', width=50)
        self.list_dir.grid(row=1, column=0, rowspan=4, sticky='nsew', padx=(0,10), pady=2)

        frame_dir_btns = ttk.Frame(frame_dir)
        frame_dir_btns.grid(row=1, column=1, sticky='ns')
        btn_add = ttk.Button(frame_dir_btns, text='추가', width=8, command=self.add_directory)
        btn_add.pack(pady=(0,5), fill='x')
        btn_remove = ttk.Button(frame_dir_btns, text='삭제', width=8, command=self.remove_directory)
        btn_remove.pack(fill='x')

        # Minimum file size entry and preset combobox, stacked vertically
        frame_size = ttk.Frame(frame_dir)
        frame_size.grid(row=0, column=2, rowspan=5, sticky='nw')

        lbl_min_size = ttk.Label(frame_size, text='최소 파일 크기(Byte):')
        lbl_min_size.pack(anchor='w', pady=(0, 3))
        self.entry_min_size = ttk.Entry(frame_size, textvariable=self.min_size_var, width=15)
        self.entry_min_size.pack(anchor='w', pady=(0, 10))

        lbl_combo = ttk.Label(frame_size, text='크기 단위 선택:')
        lbl_combo.pack(anchor='w', pady=(0, 3))
        size_options = ['0', '1M', '10M', '100M', '1G']
        self.combo_size_unit = ttk.Combobox(frame_size, values=size_options, state='readonly', width=12)
        self.combo_size_unit.current(0)
        self.combo_size_unit.pack(anchor='w')
        self.combo_size_unit.bind('<<ComboboxSelected>>', self.on_size_unit_selected)

        # Run / stop button
        self.btn_run = ttk.Button(frame_size, text='실행', width=10, command=self.toggle_run)
        self.btn_run.pack(anchor='w', pady=(20, 0))

        # Duplicate-group tree view (grid style)
        frame_hash_group = ttk.Frame(self)
        frame_hash_group.pack(fill='both', padx=10, pady=5, expand=True)
        ttk.Label(frame_hash_group, text='중복 그룹 목록').pack(anchor='w')

        columns_hash = ('count', 'hash', 'rep_name', 'total_size')
        self.tree_hash_groups = ttk.Treeview(frame_hash_group, columns=columns_hash, show='headings', selectmode='browse')
        self.tree_hash_groups.pack(side='left', fill='both', expand=True)

        self.tree_hash_groups.heading('count', text='개수')
        self.tree_hash_groups.heading('hash', text='해시값')
        self.tree_hash_groups.heading('rep_name', text='대표파일명')
        self.tree_hash_groups.heading('total_size', text='크기 (Bytes)')

        self.tree_hash_groups.column('count', width=50, anchor='center')
        self.tree_hash_groups.column('hash', width=360, anchor='w')
        self.tree_hash_groups.column('rep_name', width=250, anchor='w')
        self.tree_hash_groups.column('total_size', width=120, anchor='e')

        scrollbar_hash = ttk.Scrollbar(frame_hash_group, orient='vertical', command=self.tree_hash_groups.yview)
        scrollbar_hash.pack(side='left', fill='y')
        self.tree_hash_groups.configure(yscrollcommand=scrollbar_hash.set)

        self.tree_hash_groups.bind('<<TreeviewSelect>>', self.on_hash_group_selected)

        # File list with scrollbar and the delete / clear-selection buttons
        frame_file_list = ttk.Frame(self)
        frame_file_list.pack(fill='both', expand=True, padx=10, pady=5)

        ttk.Label(frame_file_list, text='파일 목록').pack(anchor='w')

        columns_files = ('index', 'path', 'size', 'mtime', 'hash')
        self.tree_files = ttk.Treeview(frame_file_list, columns=columns_files, show='headings', selectmode='extended')
        self.tree_files.pack(side='left', fill='both', expand=True)

        self.tree_files.heading('index', text='No.')
        self.tree_files.heading('path', text='파일 경로')
        self.tree_files.heading('size', text='크기 (Bytes)')
        self.tree_files.heading('mtime', text='수정일시')
        self.tree_files.heading('hash', text='해시값')

        self.tree_files.column('index', width=50, anchor='center')
        self.tree_files.column('path', width=450)
        self.tree_files.column('size', width=120, anchor='e')
        self.tree_files.column('mtime', width=150, anchor='center')
        self.tree_files.column('hash', width=320)

        scrollbar_files = ttk.Scrollbar(frame_file_list, orient='vertical', command=self.tree_files.yview)
        scrollbar_files.pack(side='left', fill='y')
        self.tree_files.configure(yscrollcommand=scrollbar_files.set)

        frame_buttons = ttk.Frame(frame_file_list)
        frame_buttons.pack(side='left', fill='y', padx=10)
        btn_delete = ttk.Button(frame_buttons, text='삭제', command=self.delete_selected_files)
        btn_delete.pack(fill='x', pady=5)
        btn_restore = ttk.Button(frame_buttons, text='원복', command=self.restore_selection)
        btn_restore.pack(fill='x', pady=5)

        self.tree_files.bind('<Double-1>', self.open_file_location)

    def on_size_unit_selected(self, event):
        """Copy the byte value of the chosen size preset into the size entry."""
        val = self.combo_size_unit.get()
        mapping = {
            '0':'0',
            '1M': str(1_000_000),
            '10M': str(10_000_000),
            '100M': str(100_000_000),
            '1G': str(1_000_000_000)
        }
        if val in mapping:
            self.min_size_var.set(mapping[val])

    def add_directory(self):
        """Ask for a directory and append it to the scan list if it is new."""
        dir_selected = filedialog.askdirectory()
        if dir_selected and dir_selected not in self.dir_list:
            self.dir_list.append(dir_selected)
            self.list_dir.insert(tk.END, dir_selected)

    def remove_directory(self):
        """Remove the directories currently selected in the listbox."""
        selected_indices = list(self.list_dir.curselection())
        # Delete from the end so earlier indices stay valid.
        for i in reversed(selected_indices):
            del self.dir_list[i]
            self.list_dir.delete(i)

    def toggle_run(self):
        """Start a scan, or request a stop when one is already running."""
        if not self.running:
            self.start_run()
        else:
            self.stop_requested = True

    def start_run(self):
        """Validate the inputs, run a full scan and report the outcome.

        The window title and run button are always restored, even when the
        scan raises, so the UI can never get stuck in the "running" state.
        """
        if self.running:
            return
        try:
            min_size = int(self.min_size_var.get())
            if min_size < 0:
                raise ValueError()
        except ValueError:
            messagebox.showwarning('입력오류', '최소 파일 크기는 0 이상의 정수여야 합니다.')
            return

        if not self.dir_list:
            messagebox.showwarning('경고', '하나 이상의 디렉토리를 추가하세요.')
            return

        self.running = True
        self.stop_requested = False
        self.btn_run.config(text='중단')

        self.file_info_list.clear()
        self.hash_groups.clear()
        self.tree_hash_groups.delete(*self.tree_hash_groups.get_children())
        self._clear_file_list()

        original_title = self.title()
        try:
            box_title, box_text = self._run_scan(min_size, original_title)
        finally:
            self.running = False
            self.title(original_title)
            self.btn_run.config(text='실행')

        messagebox.showinfo(box_title, box_text)

    def _run_scan(self, min_size, original_title):
        """Run collect -> hash -> group; return (title, text) for the result box."""
        stopped = ('중단', '작업이 중단되었습니다.')

        target_files = self._collect_targets(min_size)
        if target_files is None:
            return stopped
        if not target_files:
            return ('알림', '조건에 맞는 파일이 없습니다.')

        if not self._hash_targets(target_files, original_title):
            return stopped

        self._populate_groups()
        return (
            '완료',
            f'총 {len(self.file_info_list)}개의 파일 중 중복 그룹 {len(self.hash_groups)}개를 찾았습니다.',
        )

    def _collect_targets(self, min_size):
        """Walk the directories; return [(path, size)], or None when stopped.

        Each physical file is listed once even if directories overlap or a
        symlink points at it; otherwise a file would be reported as a
        duplicate of itself and its only copy could be deleted.
        """
        target_files = []
        seen = set()
        # Snapshot: the listbox may be edited while events are being pumped.
        for directory in tuple(self.dir_list):
            for root, dirs, files in os.walk(directory):
                # Pump events so the stop button works during the walk too.
                self.update()
                if self.stop_requested:
                    return None
                for fname in files:
                    fpath = os.path.join(root, fname)
                    try:
                        identity = os.path.normcase(os.path.realpath(fpath))
                        if identity in seen:
                            continue
                        size = os.path.getsize(fpath)
                    except OSError:
                        continue
                    seen.add(identity)
                    if size >= min_size:
                        target_files.append((fpath, size))
        return target_files

    def _hash_targets(self, target_files, original_title):
        """Fingerprint every target; return False when a stop was requested."""
        total_count = len(target_files)
        for processed_count, (fpath, size) in enumerate(target_files, start=1):
            if self.stop_requested:
                return False
            percent = int(processed_count / total_count * 100)
            self.title(f'{original_title} - 진행: {processed_count}/{total_count} ({percent}%)')
            self.update()

            try:
                mtime = self._best_mtime(fpath)
            except OSError:
                continue  # file vanished or became unreadable since the walk

            hashval = fast_hash(fpath)
            if hashval is not None:
                self.file_info_list.append(FileInfo(fpath, size, mtime, hashval))
        # A stop requested while the last file was processed still counts.
        return not self.stop_requested

    @staticmethod
    def _best_mtime(fpath):
        """Return the modification time, falling back to ctime when unusable."""
        try:
            mtime = os.path.getmtime(fpath)
        except OSError:
            return os.path.getctime(fpath)
        return mtime if mtime != 0 else os.path.getctime(fpath)

    @staticmethod
    def _group_row_values(hashval, files):
        """Return the display tuple (count, hash, representative name, size)."""
        total_size = sum(f.size for f in files)
        rep_name = os.path.basename(files[0].path) if files else ''
        return (len(files), hashval, rep_name, f'{total_size:,}')

    def _populate_groups(self):
        """Group scanned files by hash and fill the group tree, largest first."""
        grouped = {}
        for finfo in self.file_info_list:
            grouped.setdefault(finfo.hashval, []).append(finfo)
        # Only fingerprints shared by two or more files are duplicates.
        self.hash_groups = {k: v for k, v in grouped.items() if len(v) > 1}

        ordered = sorted(
            self.hash_groups.items(),
            key=lambda kv: sum(f.size for f in kv[1]),
            reverse=True,
        )
        for hashval, files in ordered:
            self.tree_hash_groups.insert(
                '', 'end',
                iid=self._GROUP_ROW_PREFIX + hashval,
                values=self._group_row_values(hashval, files),
            )

    def _clear_file_list(self):
        """Empty the file tree together with its row -> FileInfo mapping."""
        self.tree_files.delete(*self.tree_files.get_children())
        self._file_rows.clear()

    def on_hash_group_selected(self, event):
        """Show the files of the selected duplicate group in the file tree."""
        selected = self.tree_hash_groups.selection()
        if not selected:
            return
        # The hash is taken from the row id, not from the cell values.
        hashval = str(selected[0])[len(self._GROUP_ROW_PREFIX):]
        files = self.hash_groups.get(hashval, [])
        self._clear_file_list()
        for i, finfo in enumerate(files, start=1):
            size_str = f'{finfo.size:,}'
            mtime_str = finfo.formatted_mtime()
            row = self.tree_files.insert('', 'end', values=(i, finfo.path, size_str, mtime_str, finfo.hashval))
            self._file_rows[row] = finfo

    def delete_selected_files(self):
        """Move the selected files to the recycle bin and update both lists.

        Files that were trashed, or that no longer exist, are removed from the
        file list and from their duplicate group. The completion message is
        shown only when at least one file was moved and no error occurred.
        """
        selected_items = self.tree_files.selection()
        if not selected_items:
            messagebox.showwarning('경고', '삭제할 파일을 선택해 주십시오.')
            return

        trashed_count = 0
        failed = False
        gone_rows = []
        for row in selected_items:
            finfo = self._file_rows.get(row)
            if finfo is None:
                continue
            fpath = os.path.normpath(finfo.path)  # normalise mixed separators
            if not os.path.exists(fpath):
                messagebox.showwarning('경로 없음', f'파일 경로를 찾을 수 없습니다:\n{fpath}')
                gone_rows.append(row)
                continue
            try:
                send2trash.send2trash(fpath)
            except Exception as e:
                messagebox.showerror('오류', f'파일 삭제 실패: {fpath}\n{str(e)}')
                failed = True
                break
            trashed_count += 1
            gone_rows.append(row)

        self._forget_rows(gone_rows)
        if failed:
            return  # keep the remaining selection so the user can retry
        if trashed_count:
            messagebox.showinfo('완료', '선택한 파일을 휴지통으로 이동했습니다.')
        self.restore_selection()

    def _forget_rows(self, rows):
        """Drop file rows from the file tree and from their duplicate groups.

        A group left with fewer than two files is no longer a duplicate group:
        it is removed, and its last file is taken off the file list so that
        the only remaining copy cannot be deleted by mistake.
        """
        touched = set()
        for row in rows:
            finfo = self._file_rows.pop(row, None)
            if finfo is None:
                continue
            self.tree_files.delete(row)
            group = self.hash_groups.get(finfo.hashval)
            if group and finfo in group:
                group.remove(finfo)
                touched.add(finfo.hashval)

        for hashval in touched:
            group = self.hash_groups[hashval]
            group_row = self._GROUP_ROW_PREFIX + hashval
            row_exists = self.tree_hash_groups.exists(group_row)
            if len(group) > 1:
                if row_exists:
                    self.tree_hash_groups.item(
                        group_row, values=self._group_row_values(hashval, group))
                continue
            del self.hash_groups[hashval]
            if row_exists:
                self.tree_hash_groups.delete(group_row)
            for row, finfo in list(self._file_rows.items()):
                if finfo.hashval == hashval:
                    self.tree_files.delete(row)
                    del self._file_rows[row]

        # Renumber the "No." column so it stays contiguous.
        for number, row in enumerate(self.tree_files.get_children(), start=1):
            self.tree_files.set(row, 'index', number)

    def restore_selection(self):
        """Clear the current selection in the file tree."""
        for item in self.tree_files.selection():
            self.tree_files.selection_remove(item)

    def open_file_location(self, event):
        """Reveal the double-clicked file in the platform's file manager."""
        item = self.tree_files.identify_row(event.y)
        if not item:
            return
        finfo = self._file_rows.get(item)
        if finfo is None:
            return
        filepath = os.path.normpath(finfo.path)  # normalise mixed separators
        if not os.path.exists(filepath):
            messagebox.showerror('오류', '파일을 찾을 수 없습니다.')
            return

        folder = os.path.dirname(filepath)
        system = platform.system()

        try:
            if system == 'Windows':
                subprocess.run(['explorer', '/select,', filepath])
            elif system == 'Darwin':  # macOS
                subprocess.run(['open', '-R', filepath])
            else:  # Linux and others: open the containing folder
                subprocess.run(['xdg-open', folder])
        except Exception as e:
            messagebox.showerror('오류', f'파일 탐색기 실행 실패: {e}')

if __name__ == '__main__':
    # Script entry point: build the main window and enter the Tk event loop.
    App().mainloop()