Python script for searching directories and outputting file paths

Need a Python script to search directories and generate a list of file paths? Want to be able to exclude certain files and directories using relative paths and regular expressions? The Python 3 script below does this. The script is also available on GitHub.

How do I run the script below?

Pass the path to your JSON configuration file (see the sample input file below) as the first argument:

python3 create_files_list.py <input_json_file>

The Python Code

import os
import re
import json
import sys

def log(message, file_handle):
    """Record ``message`` as one line in ``file_handle`` and echo it to the console."""
    line = message + "\n"
    file_handle.write(line)
    print(message)

def prepare_paths(input_paths):
    """Return a new list with every path normalized for comparison.

    Each path goes through ``os.path.normpath`` and then ``os.path.normcase``,
    so slashes are normalized and, on Windows, case differences are removed.
    """
    return [os.path.normcase(os.path.normpath(raw_path)) for raw_path in input_paths]

def create_files_list_walk(search_path, excludes, log_path, manifest_path, include_regex=None, exclude_regex=None):
    """Recursively search ``search_path`` and append the kept file paths to the manifest.

    Args:
        search_path: directory to search; it is normalized with
            ``os.path.normpath`` / ``os.path.normcase`` before use.
        excludes: full paths (files or directories) to skip. They are compared
            against normalized full paths, so they must be normalized the same way.
        log_path: file that progress messages are appended to (they are also
            printed to the console).
        manifest_path: file that every kept file path is appended to, one per line.
        include_regex: patterns a file's full path must ALL match (``re.match``,
            i.e. anchored at the start of the path). ``None`` or empty keeps every file.
        exclude_regex: patterns of which ANY match (``re.match``) drops the file.
            ``None`` or empty drops nothing.

    The regular expressions are applied to files only; directories are filtered
    solely through ``excludes``.
    """
    # None stands in for "no patterns" so the defaults are not shared mutable lists.
    include_patterns = list(include_regex) if include_regex else []
    exclude_patterns = list(exclude_regex) if exclude_regex else []
    # Build the lookup set once instead of scanning a list for every entry.
    excluded_paths = frozenset(excludes)
    root = os.path.normcase(os.path.normpath(search_path))

    # Open both files exactly once for the whole traversal. Re-opening them in
    # every recursive call, while the parent's buffered handles were still open,
    # let a child's output reach the file before (or in the middle of) the
    # parent's pending lines and kept one pair of handles open per directory level.
    with open(log_path, 'a') as log_file, open(manifest_path, 'a') as manifest_file:
        _walk_directory(root, excluded_paths, log_file, manifest_file, include_patterns, exclude_patterns)


def _rejection_message(full_path, include_patterns, exclude_patterns):
    """Return the log line explaining why a file is filtered out, or None to keep it."""
    for pattern in include_patterns:
        if not re.match(pattern, full_path):
            # The file failed an include pattern, so it is skipped; say so
            # (the message used to read "included", which was misleading).
            return "not included ({0}): {1}".format(pattern, full_path)
    for pattern in exclude_patterns:
        if re.match(pattern, full_path):
            return "excluded ({0}): {1}".format(pattern, full_path)
    return None


def _walk_directory(directory, excluded_paths, log_file, manifest_file, include_patterns, exclude_patterns):
    """Log and filter the entries of one already-normalized directory, recursing into subdirectories."""
    log('searching: {0}'.format(directory), log_file)
    for entry in os.listdir(directory):
        full_path = os.path.normcase(os.path.normpath(os.path.join(directory, entry)))
        log('{0}'.format(full_path), log_file)

        # Explicit excludes apply to files and directories alike.
        if full_path in excluded_paths:
            log("excluded: " + full_path, log_file)
            continue

        if os.path.isdir(full_path):
            _walk_directory(full_path, excluded_paths, log_file, manifest_file, include_patterns, exclude_patterns)
            continue

        rejection = _rejection_message(full_path, include_patterns, exclude_patterns)
        if rejection is not None:
            log(rejection, log_file)
            continue

        manifest_file.write(full_path + "\n")

def create_files_list(config_path):
    """Build the manifest described by the JSON configuration at ``config_path``.

    The configuration must provide ``log`` (log file path), ``output``
    (manifest file path) and ``dirs`` (a list of search entries). Each entry
    requires ``dir`` and may provide ``exclude`` (paths relative to ``dir``),
    ``include_regex`` and ``exclude_regex``; a missing optional key is treated
    as an empty list.

    The log and manifest files are truncated first, then every entry in
    ``dirs`` is searched in order and its results are appended.

    Raises:
        KeyError: if ``log``, ``output``, ``dirs`` or an entry's ``dir`` is missing.
    """
    with open(config_path, 'r') as config_file:
        json_config = json.load(config_file)

    log_path = json_config['log']
    manifest_path = json_config['output']
    configs = json_config['dirs']

    # clear the log and manifest files
    with open(log_path, 'w'), open(manifest_path, 'w'):
        pass

    for config in configs:
        base_dir = config['dir']
        # convert the relative excluded paths to normalized full paths
        # (prepare_paths does the normalization, so it is not repeated here)
        excludes = prepare_paths(
            [os.path.join(base_dir, rel_path) for rel_path in config.get('exclude', [])]
        )
        create_files_list_walk(
            base_dir,
            excludes,
            log_path,
            manifest_path,
            config.get('include_regex', []),
            config.get('exclude_regex', []),
        )

# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Exit with a readable usage message instead of a bare IndexError.
        sys.exit("usage: python3 create_files_list.py <input_json_file>")
    create_files_list(sys.argv[1])

Sample JSON Input File

This JSON file indicates that we want to recursively search two directories for files. We also want to:

  1. Exclude the c:/projects/nodetest/node_modules directory (each "exclude" entry is a path relative to its "dir").
  2. Only include files with the extension .js.
  3. Exclude files with the extension .min.js.
{
    "output": "c:/projects/.emacs-js-tags-files",
    "log": "c:/projects/.emacs-js-tags-files.log",
    "dirs": [{
        "dir": "c:/projects",
        "exclude": ["nodetest/node_modules"],
        "include_regex": [".*?\\.js$"],
        "exclude_regex": [".*?\\.min\\.js$"]
    },{
        "dir": "c:/projects2",
        "exclude": [],
        "include_regex": [".*?\\.js$"],
        "exclude_regex": [".*?\\.min\\.js$"]
    }]
}

Comments

Leave a comment

What color are brown eyes? (spam prevention)
Submit
Code under MIT License unless otherwise indicated.
© 2020, Downranked, LLC.