diff --git a/python/ql/src/Expressions/HashedButNoHash.ql b/python/ql/src/Expressions/HashedButNoHash.ql index 705806bf3605..6f4dab12570c 100644 --- a/python/ql/src/Expressions/HashedButNoHash.ql +++ b/python/ql/src/Expressions/HashedButNoHash.ql @@ -12,76 +12,137 @@ */ import python -private import LegacyPointsTo +import semmle.python.dataflow.new.DataFlow +private import semmle.python.dataflow.new.internal.DataFlowDispatch +private import semmle.python.ApiGraphs -/* - * This assumes that any indexing operation where the value is not a sequence or numpy array involves hashing. - * For sequences, the index must be an int, which are hashable, so we don't need to treat them specially. - * For numpy arrays, the index may be a list, which are not hashable and needs to be treated specially. +/** + * Holds if `cls` explicitly sets `__hash__` to `None`, making instances unhashable. */ +predicate setsHashToNone(Class cls) { + DuckTyping::getAnAttributeValue(cls, "__hash__") instanceof None +} + +/** + * Holds if `cls` has a `@dataclass` decorator with `frozen=True` or `unsafe_hash=True`, + * which generates a `__hash__` method at runtime. + */ +predicate hasDataclassHash(Class cls) { + exists(DataFlow::CallCfgNode decorator | + decorator = API::moduleImport("dataclasses").getMember("dataclass").getACall() and + decorator.asExpr() = cls.getADecorator() and + decorator.getArgByName(["frozen", "unsafe_hash"]).asExpr().(ImmutableLiteral).booleanValue() = + true + ) +} -predicate numpy_array_type(ClassValue na) { - exists(ModuleValue np | np.getName() = "numpy" or np.getName() = "numpy.core" | - na.getASuperType() = np.attr("ndarray") +/** + * Holds if `cls` defines `__eq__` directly (not inherited from `object`), + * or has a `@dataclass` decorator that generates `__eq__` (the default). + */ +predicate definesEq(Class cls) { + DuckTyping::hasMethod(cls, "__eq__") + or + // @dataclass(...) call generates __eq__ unless eq=False + exists(DataFlow::CallCfgNode decorator | + decorator = API::moduleImport("dataclasses").getMember("dataclass").getACall() and + decorator.asExpr() = cls.getADecorator() and + not decorator.getArgByName("eq").asExpr().(ImmutableLiteral).booleanValue() = false ) + or + // bare @dataclass without arguments also generates __eq__ + cls.getADecorator() = + API::moduleImport("dataclasses").getMember("dataclass").getAValueReachableFromSource().asExpr() } -predicate has_custom_getitem(Value v) { - v.getClass().lookup("__getitem__") instanceof PythonFunctionValue +/** + * Holds if `cls` is a user-defined class whose instances are unhashable. + * + * In Python, a class is unhashable if: + * - It explicitly sets `__hash__ = None`, or + * - It defines `__eq__` (directly or via decorator like `@dataclass`) without + * defining `__hash__`, and doesn't have a decorator that generates `__hash__`. + */ +predicate isUnhashableUserClass(Class cls) { + setsHashToNone(cls) or - numpy_array_type(v.getClass()) + // In Python 3, defining __eq__ without __hash__ makes a class unhashable. + // This rule does not apply in Python 2. + major_version() = 3 and + DuckTyping::isNewStyle(cls) and + definesEq(cls) and + not DuckTyping::hasMethod(cls, "__hash__") and + not hasDataclassHash(cls) and + not DuckTyping::hasUnresolvedBase(getADirectSuperclass*(cls)) } -predicate explicitly_hashed(ControlFlowNode f) { - exists(CallNode c, GlobalVariable hash | - c.getArg(0) = f and c.getFunction().(NameNode).uses(hash) and hash.getId() = "hash" +/** + * Gets the name of a builtin type whose instances are unhashable. + */ +string getUnhashableBuiltinName() { result = ["list", "set", "dict", "bytearray"] } + +/** + * Holds if `origin` is a local source node tracking an unhashable instance that + * flows to `node`, with `clsName` describing the class for the alert. + */ +predicate isUnhashable(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) { + exists(Class c | + isUnhashableUserClass(c) and + origin = classInstanceTracker(c) and + origin.flowsTo(node) and + clsName = c.getName() ) + or + clsName = getUnhashableBuiltinName() and + origin = API::builtin(clsName).getAnInstance().asSource() and + origin.flowsTo(node) +} + +predicate explicitly_hashed(DataFlow::Node node) { + node = API::builtin("hash").getACall().getArg(0) } -predicate unhashable_subscript(ControlFlowNode f, ClassValue c, ControlFlowNode origin) { - is_unhashable(f, c, origin) and - exists(SubscriptNode sub | sub.getIndex() = f | - exists(Value custom_getitem | - sub.getObject().(ControlFlowNodeWithPointsTo).pointsTo(custom_getitem) and - not has_custom_getitem(custom_getitem) - ) +/** + * Holds if the subscript object in `sub[...]` is known to use hashing for indexing, + * i.e. it does not have a custom `__getitem__` that could accept unhashable indices. + */ +predicate subscriptUsesHashing(Subscript sub) { + DataFlow::exprNode(sub.getObject()) = + API::builtin("dict").getAnInstance().getAValueReachableFromSource() + or + exists(Class cls | + classInstanceTracker(cls) + .(DataFlow::LocalSourceNode) + .flowsTo(DataFlow::exprNode(sub.getObject())) and + not DuckTyping::hasMethod(cls, "__getitem__") ) } -predicate is_unhashable(ControlFlowNodeWithPointsTo f, ClassValue cls, ControlFlowNode origin) { - exists(Value v | f.pointsTo(v, origin) and v.getClass() = cls | - not cls.hasAttribute("__hash__") and not cls.failedInference(_) and cls.isNewStyle() - or - cls.lookup("__hash__") = Value::named("None") +predicate unhashable_subscript(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) { + exists(Subscript sub | + node = DataFlow::exprNode(sub.getIndex()) and + subscriptUsesHashing(sub) + | + isUnhashable(origin, node, clsName) ) } /** - * Holds if `f` is inside a `try` that catches `TypeError`. For example: - * - * try: - * ... f ... - * except TypeError: - * ... - * - * This predicate is used to eliminate false positive results. If `hash` - * is called on an unhashable object then a `TypeError` will be thrown. - * But this is not a bug if the code catches the `TypeError` and handles - * it. + * Holds if `e` is inside a `try` that catches `TypeError`. */ -predicate typeerror_is_caught(ControlFlowNode f) { +predicate typeerror_is_caught(Expr e) { exists(Try try | - try.getBody().contains(f.getNode()) and - try.getAHandler().getType().(ExprWithPointsTo).pointsTo(ClassValue::typeError()) + try.getBody().contains(e) and + try.getAHandler().getType() = API::builtin("TypeError").getAValueReachableFromSource().asExpr() ) } -from ControlFlowNode f, ClassValue c, ControlFlowNode origin +from DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName where - not typeerror_is_caught(f) and + not typeerror_is_caught(node.asExpr()) and ( - explicitly_hashed(f) and is_unhashable(f, c, origin) + explicitly_hashed(node) and isUnhashable(origin, node, clsName) or - unhashable_subscript(f, c, origin) + unhashable_subscript(origin, node, clsName) ) -select f.getNode(), "This $@ of $@ is unhashable.", origin, "instance", c, c.getQualifiedName() +select node, "This $@ of $@ is unhashable.", origin, "instance", origin, clsName diff --git a/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.expected b/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.expected new file mode 100644 index 000000000000..c9354099a12c --- /dev/null +++ b/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.expected @@ -0,0 +1,2 @@ +| test.py:17:17:17:19 | ControlFlowNode for obj | This $@ of $@ is unhashable. | test.py:16:11:16:23 | ControlFlowNode for HasEqNoHash() | instance | test.py:16:11:16:23 | ControlFlowNode for HasEqNoHash() | HasEqNoHash | +| test.py:48:17:48:17 | ControlFlowNode for p | This $@ of $@ is unhashable. | test.py:47:9:47:27 | ControlFlowNode for MutableDataclass() | instance | test.py:47:9:47:27 | ControlFlowNode for MutableDataclass() | MutableDataclass | diff --git a/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.qlref b/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.qlref new file mode 100644 index 000000000000..5842132bd430 --- /dev/null +++ b/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.qlref @@ -0,0 +1 @@ +Expressions/HashedButNoHash.ql diff --git a/python/ql/test/3/query-tests/Expressions/general/test.py b/python/ql/test/3/query-tests/Expressions/general/test.py new file mode 100644 index 000000000000..3c900c19f0b7 --- /dev/null +++ b/python/ql/test/3/query-tests/Expressions/general/test.py @@ -0,0 +1,48 @@ +# Classes without __eq__ are hashable (inherit __hash__ from object) +class NoEqClass: + def __init__(self, x): + self.x = x + +def hash_no_eq(): + obj = NoEqClass(1) + return hash(obj) # OK: no __eq__, so __hash__ is inherited from object + +# Classes with __eq__ but no __hash__ are unhashable +class HasEqNoHash: + def __eq__(self, other): + return self.x == other.x + +def hash_eq_no_hash(): + obj = HasEqNoHash() + return hash(obj) # should be flagged + +# @dataclass(frozen=True) generates __hash__, so instances are hashable +from dataclasses import dataclass + +@dataclass(frozen=True) +class FrozenPoint: + x: int + y: int + +def hash_frozen_dataclass(): + p = FrozenPoint(1, 2) + return hash(p) # OK: frozen dataclass has __hash__ + +# @dataclass(unsafe_hash=True) also generates __hash__ +@dataclass(unsafe_hash=True) +class UnsafeHashPoint: + x: int + y: int + +def hash_unsafe_hash_dataclass(): + p = UnsafeHashPoint(1, 2) + return hash(p) # OK: unsafe_hash=True generates __hash__ + +# Plain @dataclass without frozen/unsafe_hash and with __eq__ is unhashable +@dataclass +class MutableDataclass: + x: int + +def hash_mutable_dataclass(): + p = MutableDataclass(1) + return hash(p) # should be flagged: @dataclass generates __eq__ but not __hash__ diff --git a/python/ql/test/query-tests/Expressions/general/HashedButNoHash.expected b/python/ql/test/query-tests/Expressions/general/HashedButNoHash.expected index 9589547056f7..536a7281d2d6 100644 --- a/python/ql/test/query-tests/Expressions/general/HashedButNoHash.expected +++ b/python/ql/test/query-tests/Expressions/general/HashedButNoHash.expected @@ -1 +1 @@ -| expressions_test.py:42:20:42:25 | unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | file://:0:0:0:0 | builtin-class list | list | +| expressions_test.py:42:20:42:25 | ControlFlowNode for unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | list | diff --git a/python/ql/test/query-tests/Expressions/general/expressions_test.py b/python/ql/test/query-tests/Expressions/general/expressions_test.py index 5e07b58e2041..90a3c6357481 100644 --- a/python/ql/test/query-tests/Expressions/general/expressions_test.py +++ b/python/ql/test/query-tests/Expressions/general/expressions_test.py @@ -279,3 +279,12 @@ def local(): def apply(f): pass apply(foo)([1]) + +# NoEqClass is hashable (no __eq__ => inherits __hash__ from object). +class NoEqClass: + def __init__(self, x): + self.x = x + +def hash_no_eq(): + obj = NoEqClass(1) + return hash(obj) # OK: no __eq__, so __hash__ is inherited from object