CodeQL queries 1.24

Skip to end of metadata
Go to start of metadata

Name: Non-constant format string

Description: Passing a non-constant 'format' string to a printf-like function can lead to a mismatch between the number of arguments defined by the 'format' and the number of arguments actually passed to the function. If the format string ultimately stems from an untrusted source, this can be used for exploits.

ID: cpp/non-constant-format

Kind: problem

Severity: recommendation

Precision: high

Query: NonConstantFormat.ql
/**
 * @name Non-constant format string
 * @description Passing a non-constant 'format' string to a printf-like function can lead
 *              to a mismatch between the number of arguments defined by the 'format' and the number
 *              of arguments actually passed to the function. If the format string ultimately stems
 *              from an untrusted source, this can be used for exploits.
 * @kind problem
 * @problem.severity recommendation
 * @precision high
 * @id cpp/non-constant-format
 * @tags maintainability
 *       correctness
 *       security
 *       external/cwe/cwe-134
 */

import semmle.code.cpp.dataflow.TaintTracking
import semmle.code.cpp.commons.Printf

// For the following `...gettext` functions, we assume that
// all translations preserve the type and order of `%` specifiers
// (and hence are safe to use as format strings).  This
// assumption is hard-coded into the query.
predicate whitelistFunction(Function f, int arg) {
  // basic variations of gettext
  f.getName() = "_" and arg = 0
  or
  f.getName() = "gettext" and arg = 0
  or
  f.getName() = "dgettext" and arg = 1
  or
  f.getName() = "dcgettext" and arg = 1
  or
  // plural variations of gettext that take one format string for singular and another for plural form
  f.getName() = "ngettext" and
  (arg = 0 or arg = 1)
  or
  f.getName() = "dngettext" and
  (arg = 1 or arg = 2)
  or
  f.getName() = "dcngettext" and
  (arg = 1 or arg = 2)
}

// we assume that ALL uses of the `_` macro
// return constant string literals
predicate underscoreMacro(Expr e) {
  exists(MacroInvocation mi |
    mi.getMacroName() = "_" and
    mi.getExpr() = e
  )
}

/**
 * Holds if `t` cannot hold a character array, directly or indirectly.
 */
predicate cannotContainString(Type t) {
  t.getUnspecifiedType() instanceof BuiltInType
  or
  t.getUnspecifiedType() instanceof IntegralOrEnumType
}

predicate isNonConst(DataFlow::Node node) {
  exists(Expr e | e = node.asExpr() |
    exists(FunctionCall fc | fc = e.(FunctionCall) |
      not (
        whitelistFunction(fc.getTarget(), _) or
        fc.getTarget().hasDefinition()
      )
    )
    or
    exists(Parameter p | p = e.(VariableAccess).getTarget().(Parameter) |
      p.getFunction().getName() = "main" and p.getType() instanceof PointerType
    )
    or
    e instanceof CrementOperation
    or
    e instanceof AddressOfExpr
    or
    e instanceof ReferenceToExpr
    or
    e instanceof AssignPointerAddExpr
    or
    e instanceof AssignPointerSubExpr
    or
    e instanceof PointerArithmeticOperation
    or
    e instanceof FieldAccess
    or
    e instanceof PointerDereferenceExpr
    or
    e instanceof AddressOfExpr
    or
    e instanceof ExprCall
    or
    e instanceof NewArrayExpr
    or
    e instanceof AssignExpr
    or
    exists(Variable v | v = e.(VariableAccess).getTarget() |
      v.getType().(ArrayType).getBaseType() instanceof CharType and
      exists(AssignExpr ae |
        ae.getLValue().(ArrayExpr).getArrayBase().(VariableAccess).getTarget() = v
      )
    )
  )
  or
  node instanceof DataFlow::DefinitionByReferenceNode
}

pragma[noinline]
predicate isSanitizerNode(DataFlow::Node node) {
  underscoreMacro(node.asExpr())
  or
  cannotContainString(node.getType())
}

class NonConstFlow extends TaintTracking::Configuration {
  NonConstFlow() { this = "NonConstFlow" }

  override predicate isSource(DataFlow::Node source) {
    isNonConst(source) and
    not cannotContainString(source.getType())
  }

  override predicate isSink(DataFlow::Node sink) {
    exists(FormattingFunctionCall fc | sink.asExpr() = fc.getArgument(fc.getFormatParameterIndex()))
  }

  override predicate isSanitizer(DataFlow::Node node) { isSanitizerNode(node) }
}

from FormattingFunctionCall call, Expr formatString
where
  call.getArgument(call.getFormatParameterIndex()) = formatString and
  exists(NonConstFlow cf, DataFlow::Node source, DataFlow::Node sink |
    cf.hasFlow(source, sink) and
    sink.asExpr() = formatString
  )
select formatString,
  "The format string argument to " + call.getTarget().getName() +
    " should be constant to prevent security issues and other potential errors."

The printf function, related functions like sprintf and fprintf, and other functions built atop vprintf all accept a format string as one of their arguments. When such format strings are literal constants, it is easy for the programmer (and static analysis tools) to verify that the format specifiers (such as %s and %02x) in the format string are compatible with the trailing arguments of the function call. When such format strings are not literal constants, it is more difficult to maintain the program: programmers (and static analysis tools) must perform non-local data-flow analysis to deduce what values the format string argument might take.

Recommendation

If the argument passed as a format string is meant to be a plain string rather than a format string, then pass %s as the format string, and pass the original argument as the sole trailing argument.

If the argument passed as a format string is a parameter to the enclosing function, then consider redesigning the enclosing function's API to be less brittle.

Example

The following program is meant to echo its command line arguments:

#include <stdio.h>
int main(int argc, char** argv) {
  for(int i = 1; i < argc; ++i) {
    printf(argv[i]);
  }
}

The above program behaves as expected in most cases, but breaks when one of its command line arguments contains a percent character. In such cases, the behavior of the program is undefined: it might echo garbage, it might crash, or it might give a malicious attacker root access. One way of addressing the problem is to use a constant %s format string, as in the following program:

#include <stdio.h>
int main(int argc, char** argv) {
  for(int i = 1; i < argc; ++i) {
    printf("%s", argv[i]);
  }
}

Example

The following program defines a log_with_timestamp function:

void log_with_timestamp(const char* message) {
  struct tm now;
  time(&now);
  printf("[%s] ", asctime(now));
  printf(message);
}

int main(int argc, char** argv) {
  log_with_timestamp("Application is starting...\n");
  /* ... */
  log_with_timestamp("Application is closing...\n");
  return 0;
}

In the code that is visible, the reader can verify that log_with_timestamp is never called with a log message containing a percent character, but even if all current calls are correct, this presents an ongoing maintenance burden to ensure that newly-introduced calls don't contain percent characters. As in the previous example, one solution is to make the log message a trailing argument of the function call:

void log_with_timestamp(const char* message) {
  struct tm now;
  time(&now);
  printf("[%s] %s", asctime(now), message);
}

int main(int argc, char** argv) {
  log_with_timestamp("Application is starting...\n");
  /* ... */
  log_with_timestamp("Application is closing...\n");
  return 0;
}

An alternative solution is to allow log_with_timestamp to accept format arguments:

void log_with_timestamp(const char* message, ...) {
  va_list args;
  va_start(args, message);
  struct tm now;
  time(&now);
  printf("[%s] ", asctime(now));
  vprintf(message, args);
  va_end(args);
}

int main(int argc, char** argv) {
  log_with_timestamp("%s is starting...\n", argv[0]);
  /* ... */
  log_with_timestamp("%s is closing...\n", argv[0]);
  return 0;
}

In this formulation, the non-constant format string to printf has been replaced with a non-constant format string to vprintf. Semmle will no longer consider the body of log_with_timestamp to be a problem, and will instead check that every call to log_with_timestamp passes a constant format string.

References