krb5 commit: Add Python scripts to check for C style issues

Thu Oct 4 10:35:43 EDT 2012

https://github.com/krb5/krb5/commit/70a119d4dc7ed7a94cfc32c523352af1d000e1c7
commit 70a119d4dc7ed7a94cfc32c523352af1d000e1c7
Author: Greg Hudson <ghudson at mit.edu>
Date:   Mon Sep 24 16:39:39 2012 -0400

    Add Python scripts to check for C style issues
    
    util/cstyle-file.py checks a file for C style issues and displays
    line-by-line output.  It is not terribly sophisticated, and can
    probably be improved upon (e.g. by doing an emacs batch-reindent of
    the file and checking for differences in indentation).
    
    util/cstyle.py produces diffs using git, runs the file checker on each
    modified C source file in each diff, and displays the output lines
    attribute to the diff.

 src/util/cstyle-file.py |  262 +++++++++++++++++++++++++++++++++++++++++++++++
 src/util/cstyle.py      |  188 +++++++++++++++++++++++++++++++++
 2 files changed, 450 insertions(+), 0 deletions(-)

diff --git a/src/util/cstyle-file.py b/src/util/cstyle-file.py
new file mode 100644
index 0000000..edf8a39
--- /dev/null
+++ b/src/util/cstyle-file.py
@@ -0,0 +1,262 @@
+# Copyright (C) 2012 by the Massachusetts Institute of Technology.
+# All rights reserved.
+#
+# Export of this software from the United States of America may
+#   require a specific license from the United States Government.
+#   It is the responsibility of any person or organization contemplating
+#   export to obtain such a license before exporting.
+#
+# WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+# distribute this software and its documentation for any purpose and
+# without fee is hereby granted, provided that the above copyright
+# notice appear in all copies and that both that copyright notice and
+# this permission notice appear in supporting documentation, and that
+# the name of M.I.T. not be used in advertising or publicity pertaining
+# to distribution of the software without specific, written prior
+# permission.  Furthermore if you modify this software you must label
+# your software as modified software and not distribute it in such a
+# fashion that it might be confused with the original M.I.T. software.
+# M.I.T. makes no representations about the suitability of
+# this software for any purpose.  It is provided "as is" without express
+# or implied warranty.
+
+# This program checks for some kinds of MIT krb5 coding style
+# violations in a single file.  Checked violations include:
+#
+#   Line is too long
+#   Tabs violations
+#   Trailing whitespace and final blank lines
+#   Comment formatting errors
+#   Preprocessor statements in function bodies
+#   Misplaced braces
+#   Space before paren in function call, or no space after if/for/while
+#   Parenthesized return expression
+#   Space after cast operator, or no space before * in cast operator
+#   Line broken before binary operator
+#   Lack of spaces around binary operator (sometimes)
+#   Assignment at the beginning of an if conditional
+#   Use of prohibited string functions
+#   Lack of braces around 2+ line flow control body
+#
+# This program does not check for the following:
+#
+#   Anything outside of a function body except line length/whitespace
+#   Anything non-syntactic (proper cleanup flow control, naming, etc.)
+#   Indentation or alignment of continuation lines
+#   UTF-8 violations
+#   Implicit tests against NULL or '\0'
+#   Inner-scope variable declarations
+#   Over- or under-parenthesization
+#   Long or deeply nested function bodies
+#   Syntax of function calls through pointers
+
+import re
+import sys
+
+def warn(ln, msg):
+    print '%5d  %s' % (ln, msg)
+
+
+def check_length(line, ln):
+    if len(line) > 79 and not line.startswith(' * Copyright'):
+        warn(ln, 'Length exceeds 79 characters')
+
+
+def check_tabs(line, ln, allow_tabs, seen_tab):
+    if not allow_tabs:
+        if '\t' in line:
+            warn(ln, 'Tab character in file which does not allow tabs')
+    else:
+        if ' \t' in line:
+            warn(ln, 'Tab character immediately following space')
+        if '        ' in line and seen_tab:
+            warn(ln, '8+ spaces in file which uses tabs')
+
+
+def check_trailing_whitespace(line, ln):
+    if line and line[-1] in ' \t':
+        warn(ln, 'Trailing whitespace')
+
+
+def check_comment(lines, ln):
+    align = lines[0].index('/*') + 1
+    if not lines[0].lstrip().startswith('/*'):
+        warn(ln, 'Multi-line comment begins after code')
+    for line in lines[1:]:
+        ln += 1
+        if len(line) <= align or line[align] != '*':
+            warn(ln, 'Comment line does not have * aligned with top')
+        elif line[:align].lstrip() != '':
+            warn(ln, 'Garbage before * in comment line')
+    if not lines[-1].rstrip().endswith('*/'):
+        warn(ln, 'Code after end of multi-line comment')
+    if len(lines) > 2 and (lines[0].strip() not in ('/*', '/**') or
+                           lines[-1].strip() != '*/'):
+        warn(ln, 'Comment is 3+ lines but is not formatted as block comment')
+
+
+def check_preprocessor(line, ln):
+    if line.startswith('#'):
+        warn(ln, 'Preprocessor statement in function body')
+
+
+def check_braces(line, ln):
+    # Strip out one-line initializer expressions.
+    line = re.sub(r'=\s*{.*}', '', line)
+    if line.lstrip().startswith('{') and not line.startswith('{'):
+        warn(ln, 'Un-cuddled open brace')
+    if re.search(r'{\s*\S', line):
+        warn(ln, 'Code on line after open brace')
+    if re.search(r'\S.*}', line):
+        warn(ln, 'Code on line before close brace')
+
+
+# This test gives false positives on function pointer type
+# declarations or casts.  Avoid this by using typedefs.
+def check_space_before_paren(line, ln):
+    for m in re.finditer(r'([\w]+)(\s*)\(', line):
+        ident, ws = m.groups()
+        if ident in ('if', 'for', 'while', 'switch'):
+            if not ws:
+                warn(ln, 'No space after flow control keyword')
+        elif ident != 'return':
+            if ws:
+                warn(ln, 'Space before paren in function call')
+
+
+def check_parenthesized_return(line, ln):
+    if re.search(r'return\s*\(.*\);', line):
+        warn(ln, 'Parenthesized return expression')
+
+
+def check_cast(line, ln):
+    # We can't reliably distinguish cast operators from parenthesized
+    # expressions or function call parameters without a real C parser,
+    # so we use some heuristics.  A cast operator is followed by an
+    # expression, which usually begins with an identifier or an open
+    # paren.  A function call or parenthesized expression is never
+    # followed by an identifier and only rarely by an open paren.  We
+    # won't detect a cast operator when it's followed by an expression
+    # beginning with '*', since it's hard to distinguish that from a
+    # multiplication operator.  We will get false positives from
+    # "(*fp) (args)" and "if (condition) statement", but both of those
+    # are erroneous anyway.
+    for m in re.finditer(r'\(([^(]+)\)(\s+)[a-zA-Z_]', line):
+        if m.group(2):
+            warn(ln, 'Space after cast operator (or inline if/while body)')
+        # Check for casts like (char*) which should have a space.
+        if re.search(r'[^\s\*]\*+$', m.group(1)):
+            warn(ln, 'No space before * in cast operator')
+
+
+def check_binary_operator(line, ln):
+    binop = r'(\+|-|\*|/|%|\^|==|=|!=|<=|<|>=|>|&&|&|\|\||\|)'
+    if re.match(r'\s*' + binop + r'\s', line):
+        warn(ln - 1, 'Line broken before binary operator')
+    if re.search(r'\w' + binop + r'\w', line):
+        warn(ln, 'No space before or after binary operator')
+
+
+def check_assignment_in_conditional(line, ln):
+    # Check specifically for if statements; we allow assignments in
+    # loop expressions.
+    if re.search(r'if\s*\(+\w+\s*=[^=]', line):
+        warn(ln, 'Assignment in if conditional')
+
+
+def check_unbraced_do(line, ln):
+    if re.match(r'\s*do$', line):
+        warn(ln, 'do statement without braces')
+
+
+def check_bad_string_fn(line, ln):
+    # This is intentionally pretty fuzzy so that we catch the whole scanf
+    if re.search(r'\W(strcpy|strcat|sprintf|\w*scanf)\W', line):
+        warn(ln, 'Prohibited string function')
+
+
+def check_file(lines):
+    # Check if this file allows tabs.
+    if len(lines) == 0:
+        return
+    allow_tabs = 'indent-tabs-mode: nil' not in lines[0]
+    seen_tab = False
+
+    in_function = False
+    unbraced_flow_body_count = -1
+    comment = []
+    ln = 0
+    for line in lines:
+        ln += 1
+        line = line.rstrip('\r\n')
+        indent = len(re.match('\s*', line).group(0).expandtabs())
+        seen_tab = seen_tab or ('\t' in line)
+
+        # Check line structure issues before altering the line.
+        check_length(line, ln)
+        check_tabs(line, ln, allow_tabs, seen_tab)
+        check_trailing_whitespace(line, ln)
+
+        # Strip out single-line comments the contents of string literals.
+        if not comment:
+            line = re.sub(r'/\*.*?\*/', '', line)
+            line = re.sub(r'"(\\.|[^"])*"', '""', line)
+
+        # Parse out and check multi-line comments.  (Ignore code on
+        # the first or last line; check_comment will warn about it.)
+        if comment or '/*' in line:
+            comment.append(line)
+            if '*/' in line:
+                check_comment(comment, ln - len(comment) + 1)
+                comment = []
+            continue
+
+        # Warn if we see a // comment and ignore anything following.
+        if '//' in line:
+            warn(ln, '// comment')
+            line = re.sub(r'//.*/', '', line)
+
+        if line.startswith('{'):
+            in_function = True
+        elif line.startswith('}'):
+            in_function = False
+
+        if in_function:
+            check_preprocessor(line, ln)
+            check_braces(line, ln)
+            check_space_before_paren(line, ln)
+            check_parenthesized_return(line, ln)
+            check_cast(line, ln)
+            check_binary_operator(line, ln)
+            check_assignment_in_conditional(line, ln)
+            check_unbraced_do(line, ln)
+            check_bad_string_fn(line, ln)
+
+            # Check for unbraced flow statement bodies.
+            if unbraced_flow_body_count != -1:
+                if indent > flow_statement_indent:
+                    unbraced_flow_body_count += 1
+                else:
+                    if unbraced_flow_body_count > 1:
+                        warn(ln - 1, 'Body is 2+ lines but has no braces')
+                    unbraced_flow_body_count = -1
+            if (re.match('(\s*)(if|else if|for|while)\s*\(.*\)$', line) or
+                re.match('(\s*)else$', line)):
+                unbraced_flow_body_count = 0
+                flow_statement_indent = indent
+
+    if lines[-1] == '':
+        warn(ln, 'Blank line at end of file')
+
+
+if len(sys.argv) == 1:
+    lines = sys.stdin.readlines()
+elif len(sys.argv) == 2:
+    f = open(sys.argv[1])
+    lines = f.readlines()
+    f.close()
+else:
+    sys.stderr.write('Usage: cstyle-file [filename]\n')
+    sys.exit(1)
+
+check_file(lines)
diff --git a/src/util/cstyle.py b/src/util/cstyle.py
new file mode 100644
index 0000000..7c45335
--- /dev/null
+++ b/src/util/cstyle.py
@@ -0,0 +1,188 @@
+# Copyright (C) 2012 by the Massachusetts Institute of Technology.
+# All rights reserved.
+#
+# Export of this software from the United States of America may
+#   require a specific license from the United States Government.
+#   It is the responsibility of any person or organization contemplating
+#   export to obtain such a license before exporting.
+#
+# WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+# distribute this software and its documentation for any purpose and
+# without fee is hereby granted, provided that the above copyright
+# notice appear in all copies and that both that copyright notice and
+# this permission notice appear in supporting documentation, and that
+# the name of M.I.T. not be used in advertising or publicity pertaining
+# to distribution of the software without specific, written prior
+# permission.  Furthermore if you modify this software you must label
+# your software as modified software and not distribute it in such a
+# fashion that it might be confused with the original M.I.T. software.
+# M.I.T. makes no representations about the suitability of
+# this software for any purpose.  It is provided "as is" without express
+# or implied warranty.
+
+# This program attempts to detect MIT krb5 coding style violations
+# attributable to the changes a series of git commits.  It can be run
+# from anywhere within a git working tree.
+
+import getopt
+import os
+import re
+import sys
+from subprocess import Popen, PIPE, call
+
+def usage():
+    u = ['Usage: cstyle [-w] [rev|rev1..rev2]',
+         '',
+         'By default, checks working tree against HEAD, or checks changes in',
+         'HEAD if the working tree is clean.  With a revision option, checks',
+         'changes in rev or the series rev1..rev2.  With the -w option,',
+         'checks working tree against rev (defaults to HEAD).']
+    sys.stderr.write('\n'.join(u) + '\n')
+    sys.exit(1)
+
+
+# Run a command and return a list of its output lines.
+def run(args):
+    # subprocess.check_output would be ideal here, but requires Python 2.7.
+    p = Popen(args, stdout=PIPE, stderr=PIPE)
+    out, err = p.communicate()
+    if p.returncode != 0:
+        sys.stderr.write('Failed command: ' + ' '.join(args) + '\n')
+        if err != '':
+            sys.stderr.write('stderr:\n' + err)
+        sys.stderr.write('Unexpected command failure, exiting\n')
+        sys.exit(1)
+    return out.splitlines()
+
+
+# Find the top level of the git working tree, or None if we're not in
+# one.
+def find_toplevel():
+    # git doesn't seem to have a way to do this, so we search by hand.
+    dir = os.getcwd()
+    while True:
+        if os.path.exists(os.path.join(dir, '.git')):
+            break
+        parent = os.path.dirname(dir)
+        if (parent == dir):
+            return None
+        dir = parent
+    return dir
+
+
+# Check for style issues in a file within rev (or within the current
+# checkout if rev is None).  Report only problems on line numbers in
+# new_lines.
+line_re = re.compile(r'^\s*(\d+)  (.*)$')
+def check_file(filename, rev, new_lines):
+    # Process only C source files under src.
+    root, ext = os.path.splitext(filename)
+    if not filename.startswith('src/') or ext not in ('.c', '.h', '.hin'):
+        return
+    dispname = filename[4:]
+
+    if rev is None:
+        p1 = Popen(['cat', filename], stdout=PIPE)
+    else:
+        p1 = Popen(['git', 'show', rev + ':' + filename], stdout=PIPE)
+    p2 = Popen(['python', 'src/util/cstyle-file.py'], stdin=p1.stdout,
+               stdout=PIPE)
+    p1.stdout.close()
+    out, err = p2.communicate()
+    if p2.returncode != 0:
+        sys.exit(1)
+
+    first = True
+    for line in out.splitlines():
+        m = line_re.match(line)
+        if int(m.group(1)) in new_lines:
+            if first:
+                print '  ' + dispname + ':'
+                first = False
+            print '    ' + line
+
+
+# Determine the lines of each file modified by diff (a sequence of
+# strings) and check for style violations in those lines.  rev
+# indicates the version in which the new contents of each file can be
+# found, or is None if the current contents are in the working copy.
+chunk_header_re = re.compile(r'^@@ -\d+(,(\d+))? \+(\d+)(,(\d+))? @@')
+def check_diff(diff, rev):
+    old_count, new_count, lineno = 0, 0, 0
+    filename = None
+    for line in diff:
+        if not line or line.startswith('\\ No newline'):
+            continue
+        if old_count > 0 or new_count > 0:
+            # We're in a chunk.
+            if line[0] == '+':
+                new_lines.append(lineno)
+            if line[0] in ('+', ' '):
+                new_count = new_count - 1
+                lineno = lineno + 1
+            if line[0] in ('-', ' '):
+                old_count = old_count - 1
+        elif line.startswith('+++ b/'):
+            # We're starting a new file.  Check the last one.
+            if filename:
+                check_file(filename, rev, new_lines)
+            filename = line[6:]
+            new_lines = []
+        else:
+            m = chunk_header_re.match(line)
+            if m:
+                old_count = int(m.group(2) or '1')
+                lineno = int(m.group(3))
+                new_count = int(m.group(5) or '1')
+
+    # Check the last file in the diff.
+    if filename:
+        check_file(filename, rev, new_lines)
+
+
+# Check a sequence of revisions for style issues.
+def check_series(revlist):
+    for rev in revlist:
+        sys.stdout.flush()
+        call(['git', 'show', '-s', '--oneline', rev])
+        diff = run(['git', 'diff-tree', '--no-commit-id', '--root', '-M',
+                    '--cc', rev])
+        check_diff(diff, rev)
+
+
+# Parse arguments.
+try:
+    opts, args = getopt.getopt(sys.argv[1:], 'w')
+except getopt.GetoptError, err:
+    print str(err)
+    usage()
+if len(args) > 1:
+    usage()
+
+# Change to the top level of the working tree so we easily run the file
+# checker and refer to working tree files.
+toplevel = find_toplevel()
+if toplevel is None:
+    sys.stderr.write('%s must be run within a git working tree')
+os.chdir(toplevel)
+
+if ('-w', '') in opts:
+    # Check the working tree against a base revision.
+    arg = 'HEAD'
+    if args:
+        arg = args[0]
+    check_diff(run(['git', 'diff', arg]), None)
+elif args:
+    # Check the differences in a rev or a series of revs.
+    if '..' in args[0]:
+        check_series(run(['git', 'rev-list', '--reverse', args[0]]))
+    else:
+        check_series([args[0]])
+else:
+    # No options or arguments.  Check the differences against HEAD, or
+    # the differences in HEAD if the working tree is clean.
+    diff = run(['git', 'diff', 'HEAD'])
+    if diff:
+        check_diff(diff, None)
+    else:
+        check_series(['HEAD'])