Semmle 1.20
Skip to end of metadata
Go to start of metadata

Name: Non-constant format string

Description: Passing a non-constant 'format' string to a printf-like function can lead to a mismatch between the number of arguments defined by the 'format' and the number of arguments actually passed to the function. If the format string ultimately stems from an untrusted source, this can be used for exploits.

ID: cpp/non-constant-format

Kind: problem

Severity: recommendation

Precision: high

Query: NonConstantFormat.ql
/**
 * @name Non-constant format string
 * @description Passing a non-constant 'format' string to a printf-like function can lead
 *              to a mismatch between the number of arguments defined by the 'format' and the number
 *              of arguments actually passed to the function. If the format string ultimately stems
 *              from an untrusted source, this can be used for exploits.
 * @kind problem
 * @problem.severity recommendation
 * @precision high
 * @id cpp/non-constant-format
 * @tags maintainability
 *       correctness
 *       security
 *       external/cwe/cwe-134
 */
import cpp

/**
 * Holds if `t` is a character array or pointer, where `charType` is the type of
 * characters in `t`.
 */
predicate stringType(Type t, Type charType) {
  (
    ( charType = t.(PointerType).getBaseType() or
      charType = t.(ArrayType).getBaseType()
    ) and (
      charType.getUnspecifiedType() instanceof CharType or
      charType.getUnspecifiedType() instanceof Wchar_t
    )
  )
  or
  stringType(t.getUnderlyingType(), charType)
  or
  stringType(t.(SpecifiedType).getBaseType(), charType)
}

predicate gettextFunction(Function f, int arg) {
  // basic variations of gettext
  (f.getName() =          "_" and arg = 0) or
  (f.getName() =    "gettext" and arg = 0) or
  (f.getName() =   "dgettext" and arg = 1) or
  (f.getName() =  "dcgettext" and arg = 1) or
  // plural variations of gettext that take one format string for singular and another for plural form
  (f.getName() =   "ngettext" and (arg = 0 or arg = 1)) or
  (f.getName() =  "dngettext" and (arg = 1 or arg = 2)) or
  (f.getName() = "dcngettext" and (arg = 1 or arg = 2))
}

predicate stringArray(Variable arr, AggregateLiteral init) {
  arr.getInitializer().getExpr() = init and
  stringType(arr.getType().getUnspecifiedType().(ArrayType).getBaseType(), _)
  // Ideally, this predicate should also check that no item of `arr` is ever
  // reassigned, but such an analysis could get fairly complicated. Instead, we
  // just hope that nobody would initialize an array of constants and then
  // overwrite some of them with untrusted data.
}

predicate underscoreMacro(Expr e) {
  exists(MacroInvocation mi |
    mi.getMacroName() = "_" and
    mi.getExpr() = e
  )
}

/**
 * Holds if a value of character pointer type may flow _directly_ from `src` to
 * `dst`.
 */
predicate stringFlowStep(Expr src, Expr dst) {
  not underscoreMacro(dst)
  and
  stringType(dst.getType(), _)
  and
  (
    src = dst.(VariableAccess).getTarget().getAnAssignedValue()
    or
    src = dst.(ConditionalExpr).getThen()
    or
    src = dst.(ConditionalExpr).getElse()
    or
    src = dst.(AssignExpr).getRValue()
    or
    src = dst.(CommaExpr).getRightOperand()
    or
    src = dst.(UnaryPlusExpr).getOperand()
    or
    stringArray(dst.(ArrayExpr).getArrayBase().(VariableAccess).getTarget(),
                src.getParent())
    or
    // Follow a parameter to its arguments in all callers.
    exists(Parameter p | p = dst.(VariableAccess).getTarget() |
      src = p.getFunction().getACallToThisFunction().getArgument(p.getIndex())
    )
    or
    // Follow a call to a gettext function without looking at its body even if
    // the body is known. This ensures that we report the error in the relevant
    // location rather than inside the body of some `_` function.
    exists(Function gettext, FunctionCall call, int idx |
      gettextFunction(gettext, idx) and
      dst = call and
      gettext = call.getTarget()
    | src = call.getArgument(idx)
    )
    or
    // Follow a call to a non-gettext function through its return statements.
    exists(Function f, ReturnStmt retStmt |
      f = dst.(FunctionCall).getTarget() and
      f = retStmt.getEnclosingFunction() and
      not gettextFunction(f, _)
    | src = retStmt.getExpr()
    )
  )
}

/** Holds if `v` may be written to, other than through `AssignExpr`. */
predicate nonConstVariable(Variable v) {
  exists(Type charType |
    stringType(v.getType(), charType) and not charType.isConst()
  )
  or
  exists(AssignPointerAddExpr e |
    e.getLValue().(VariableAccess).getTarget() = v
  )
  or
  exists(AssignPointerSubExpr e |
    e.getLValue().(VariableAccess).getTarget() = v
  )
  or
  exists(CrementOperation e | e.getOperand().(VariableAccess).getTarget() = v)
  or
  exists(AddressOfExpr e | e.getOperand().(VariableAccess).getTarget() = v)
  or
  exists(ReferenceToExpr e | e.getExpr().(VariableAccess).getTarget() = v)
}

/**
 * Holds if this expression is _directly_ considered non-constant for the
 * purpose of this query.
 *
 * Each case of `Expr` that may have string type should be listed either in
 * `nonConst` or `stringFlowStep`. Omitting a case leads to false negatives.
 * Having a case in both places leads to unnecessary computation.
 */
predicate nonConst(Expr e) {
  nonConstVariable(e.(VariableAccess).getTarget())
  or
  e instanceof CrementOperation
  or
  e instanceof AssignPointerAddExpr
  or
  e instanceof AssignPointerSubExpr
  or
  e instanceof PointerArithmeticOperation
  or
  e instanceof FieldAccess
  or
  e instanceof PointerDereferenceExpr
  or
  e instanceof AddressOfExpr
  or
  e instanceof ExprCall
  or
  exists(ArrayExpr ae | ae = e |
    not stringArray(ae.getArrayBase().(VariableAccess).getTarget(), _)
  )
  or
  e instanceof NewArrayExpr
  or
  // e is a call to a function whose definition we cannot see. We assume it is
  // not constant.
  exists(Function f | f = e.(FunctionCall).getTarget() |
    not f.hasDefinition()
  )
  or
  // e is a parameter of a function that's never called. If it were called, we
  // would instead have followed the call graph in `stringFlowStep`.
  exists(Function f
  | f = e.(VariableAccess).getTarget().(Parameter).getFunction()
  | not exists(f.getACallToThisFunction())
  )
}

predicate formattingFunctionArgument(
    FormattingFunction ff, FormattingFunctionCall fc, Expr arg)
{
  fc.getTarget() = ff and
  fc.getArgument(ff.getFormatParameterIndex()) = arg and
  // Don't look for errors inside functions that are themselves formatting
  // functions. We expect that the interesting errors will be in their callers.
  not fc.getEnclosingFunction() instanceof FormattingFunction
}

// Reflexive-transitive closure of `stringFlow`, restricting the base case to
// only consider destination expressions that are arguments to formatting
// functions.
predicate stringFlow(Expr src, Expr dst) {
  formattingFunctionArgument(_, _, dst) and src = dst
  or
  exists(Expr mid | stringFlowStep(src, mid) and stringFlow(mid, dst))
}

predicate whitelisted(Expr e) {
  gettextFunction(e.(FunctionCall).getTarget(), _)
  or
  underscoreMacro(e)
}

predicate flowFromNonConst(Expr src, Expr dst) {
  stringFlow(src, dst) and
  nonConst(src) and
  not whitelisted(src)
}

from FormattingFunctionCall fc, FormattingFunction ff, Expr format
where formattingFunctionArgument(ff, fc, format) and
      flowFromNonConst(_, format) and
      not fc.isInMacroExpansion()
select format, "The format string argument to " + ff.getName() + " should be constant to prevent security issues and other potential errors."

The printf function, related functions like sprintf and fprintf, and other functions built atop vprintf all accept a format string as their first argument. When such format strings are literal constants, it is easy for the programmer (and static analysis tools) to verify that the format specifiers (such as %s and %02x) in the format string are compatible with the trailing arguments of the function call. When such format strings are not literal constants, it is more difficult to maintain the program: programmers (and static analysis tools) must perform non-local data-flow analysis to deduce what values the format string argument might take.

Recommendation

If the argument passed as a format string is meant to be a plain string rather than a format string, then pass %s as the format string, and pass the original argument as the sole trailing argument.

If the argument passed as a format string is a parameter to the enclosing function, then consider redesigning the enclosing function's API to be less brittle.

Example

The following program is meant to echo its command line arguments:

#include <stdio.h>
int main(int argc, char** argv) {
  for(int i = 1; i < argc; ++i) {
    printf(argv[i]);
  }
}

The above program behaves as expected in most cases, but breaks when one of its command line arguments contains a percent character. In such cases, the behavior of the program is undefined: it might echo garbage, it might crash, or it might give a malicious attacker root access. One way of addressing the problem is to use a constant %s format string, as in the following program:

#include <stdio.h>
int main(int argc, char** argv) {
  for(int i = 1; i < argc; ++i) {
    printf("%s", argv[i]);
  }
}

Example

The following program defines a log_with_timestamp function:

void log_with_timestamp(const char* message) {
  struct tm now;
  time(&now);
  printf("[%s] ", asctime(now));
  printf(message);
}

int main(int argc, char** argv) {
  log_with_timestamp("Application is starting...\n");
  /* ... */
  log_with_timestamp("Application is closing...\n");
  return 0;
}

In the code that is visible, the reader can verify that log_with_timestamp is never called with a log message containing a percent character, but even if all current calls are correct, this presents an ongoing maintenance burden to ensure that newly-introduced calls don't contain percent characters. As in the previous example, one solution is to make the log message a trailing argument of the function call:

void log_with_timestamp(const char* message) {
  struct tm now;
  time(&now);
  printf("[%s] %s", asctime(now), message);
}

int main(int argc, char** argv) {
  log_with_timestamp("Application is starting...\n");
  /* ... */
  log_with_timestamp("Application is closing...\n");
  return 0;
}

An alternative solution is to allow log_with_timestamp to accept format arguments:

void log_with_timestamp(const char* message, ...) {
  va_list args;
  va_start(args, message);
  struct tm now;
  time(&now);
  printf("[%s] ", asctime(now));
  vprintf(message, args);
  va_end(args);
}

int main(int argc, char** argv) {
  log_with_timestamp("%s is starting...\n", argv[0]);
  /* ... */
  log_with_timestamp("%s is closing...\n", argv[0]);
  return 0;
}

In this formulation, the non-constant format string to printf has been replaced with a non-constant format string to vprintf. Semmle will no longer consider the body of log_with_timestamp to be a problem, and will instead check that every call to log_with_timestamp passes a constant format string.

References