From 682178a37068466cb83350b35bf51920b29e70fe Mon Sep 17 00:00:00 2001 From: Taus Date: Fri, 20 Feb 2026 16:33:25 +0000 Subject: [PATCH 1/2] Python: Port HashedButNoHash.ql This one is a bit more involved. Of note is the fact that it at present only uses local flow when determining the origin of some value (whereas the points-to version used global flow). It may be desirable to rewrite this query to use global data-flow, but this should be done with some care (as using "all unhashable objects" as the set of sources is somewhat iffy with respect to performance). For that reason, I'm sticking to mostly local flow (except for well behaved things like types and built-ins). --- python/ql/src/Expressions/HashedButNoHash.ql | 115 +++++++++++------- .../general/HashedButNoHash.expected | 2 +- 2 files changed, 69 insertions(+), 48 deletions(-) diff --git a/python/ql/src/Expressions/HashedButNoHash.ql b/python/ql/src/Expressions/HashedButNoHash.ql index 705806bf3605..93edc7c9c6cc 100644 --- a/python/ql/src/Expressions/HashedButNoHash.ql +++ b/python/ql/src/Expressions/HashedButNoHash.ql @@ -12,76 +12,97 @@ */ import python -private import LegacyPointsTo +import semmle.python.dataflow.new.DataFlow +private import semmle.python.dataflow.new.internal.DataFlowDispatch +private import semmle.python.ApiGraphs -/* - * This assumes that any indexing operation where the value is not a sequence or numpy array involves hashing. - * For sequences, the index must be an int, which are hashable, so we don't need to treat them specially. - * For numpy arrays, the index may be a list, which are not hashable and needs to be treated specially. +/** + * Holds if `cls` explicitly sets `__hash__` to `None`, making instances unhashable. */ - -predicate numpy_array_type(ClassValue na) { - exists(ModuleValue np | np.getName() = "numpy" or np.getName() = "numpy.core" | - na.getASuperType() = np.attr("ndarray") - ) +predicate setsHashToNone(Class cls) { + DuckTyping::getAnAttributeValue(cls, "__hash__") instanceof None } -predicate has_custom_getitem(Value v) { - v.getClass().lookup("__getitem__") instanceof PythonFunctionValue +/** + * Holds if `cls` is a user-defined class whose instances are unhashable. + * A new-style class without `__hash__` is unhashable, as is one that explicitly + * sets `__hash__ = None`. + */ +predicate isUnhashableUserClass(Class cls) { + DuckTyping::isNewStyle(cls) and + not DuckTyping::hasMethod(cls, "__hash__") and + not DuckTyping::hasUnresolvedBase(getADirectSuperclass*(cls)) or - numpy_array_type(v.getClass()) + setsHashToNone(cls) } -predicate explicitly_hashed(ControlFlowNode f) { - exists(CallNode c, GlobalVariable hash | - c.getArg(0) = f and c.getFunction().(NameNode).uses(hash) and hash.getId() = "hash" +/** + * Gets the name of a builtin type whose instances are unhashable. + */ +string getUnhashableBuiltinName() { result = ["list", "set", "dict", "bytearray"] } + +/** + * Holds if `origin` is a local source node tracking an unhashable instance that + * flows to `node`, with `clsName` describing the class for the alert. + */ +predicate isUnhashable(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) { + exists(Class c | + isUnhashableUserClass(c) and + origin = classInstanceTracker(c) and + origin.flowsTo(node) and + clsName = c.getName() ) + or + clsName = getUnhashableBuiltinName() and + origin = API::builtin(clsName).getAnInstance().asSource() and + origin.flowsTo(node) +} + +predicate explicitly_hashed(DataFlow::Node node) { + node = API::builtin("hash").getACall().getArg(0) } -predicate unhashable_subscript(ControlFlowNode f, ClassValue c, ControlFlowNode origin) { - is_unhashable(f, c, origin) and - exists(SubscriptNode sub | sub.getIndex() = f | - exists(Value custom_getitem | - sub.getObject().(ControlFlowNodeWithPointsTo).pointsTo(custom_getitem) and - not has_custom_getitem(custom_getitem) - ) +/** + * Holds if the subscript object in `sub[...]` is known to use hashing for indexing, + * i.e. it does not have a custom `__getitem__` that could accept unhashable indices. + */ +predicate subscriptUsesHashing(Subscript sub) { + DataFlow::exprNode(sub.getObject()) = + API::builtin("dict").getAnInstance().getAValueReachableFromSource() + or + exists(Class cls | + classInstanceTracker(cls) + .(DataFlow::LocalSourceNode) + .flowsTo(DataFlow::exprNode(sub.getObject())) and + not DuckTyping::hasMethod(cls, "__getitem__") ) } -predicate is_unhashable(ControlFlowNodeWithPointsTo f, ClassValue cls, ControlFlowNode origin) { - exists(Value v | f.pointsTo(v, origin) and v.getClass() = cls | - not cls.hasAttribute("__hash__") and not cls.failedInference(_) and cls.isNewStyle() - or - cls.lookup("__hash__") = Value::named("None") +predicate unhashable_subscript(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) { + exists(Subscript sub | + node = DataFlow::exprNode(sub.getIndex()) and + subscriptUsesHashing(sub) + | + isUnhashable(origin, node, clsName) ) } /** - * Holds if `f` is inside a `try` that catches `TypeError`. For example: - * - * try: - * ... f ... - * except TypeError: - * ... - * - * This predicate is used to eliminate false positive results. If `hash` - * is called on an unhashable object then a `TypeError` will be thrown. - * But this is not a bug if the code catches the `TypeError` and handles - * it. + * Holds if `e` is inside a `try` that catches `TypeError`. */ -predicate typeerror_is_caught(ControlFlowNode f) { +predicate typeerror_is_caught(Expr e) { exists(Try try | - try.getBody().contains(f.getNode()) and - try.getAHandler().getType().(ExprWithPointsTo).pointsTo(ClassValue::typeError()) + try.getBody().contains(e) and + try.getAHandler().getType() = API::builtin("TypeError").getAValueReachableFromSource().asExpr() ) } -from ControlFlowNode f, ClassValue c, ControlFlowNode origin +from DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName where - not typeerror_is_caught(f) and + not typeerror_is_caught(node.asExpr()) and ( - explicitly_hashed(f) and is_unhashable(f, c, origin) + explicitly_hashed(node) and isUnhashable(origin, node, clsName) or - unhashable_subscript(f, c, origin) + unhashable_subscript(origin, node, clsName) ) -select f.getNode(), "This $@ of $@ is unhashable.", origin, "instance", c, c.getQualifiedName() +select node, "This $@ of $@ is unhashable.", origin, "instance", origin, clsName diff --git a/python/ql/test/query-tests/Expressions/general/HashedButNoHash.expected b/python/ql/test/query-tests/Expressions/general/HashedButNoHash.expected index 9589547056f7..536a7281d2d6 100644 --- a/python/ql/test/query-tests/Expressions/general/HashedButNoHash.expected +++ b/python/ql/test/query-tests/Expressions/general/HashedButNoHash.expected @@ -1 +1 @@ -| expressions_test.py:42:20:42:25 | unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | file://:0:0:0:0 | builtin-class list | list | +| expressions_test.py:42:20:42:25 | ControlFlowNode for unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | list | From a7ba98cff779c1a64731d99c4e391679ea425a74 Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 9 Apr 2026 21:03:54 +0000 Subject: [PATCH 2/2] Python: Eliminate some FPs Two main variants: cases where `__eq__` is not defined (which inherit `__hash__` automatically), and frozen dataclasses. --- python/ql/src/Expressions/HashedButNoHash.ql | 48 +++++++++++++++++-- .../general/HashedButNoHash.expected | 2 + .../Expressions/general/HashedButNoHash.qlref | 1 + .../3/query-tests/Expressions/general/test.py | 48 +++++++++++++++++++ .../Expressions/general/expressions_test.py | 9 ++++ 5 files changed, 104 insertions(+), 4 deletions(-) create mode 100644 python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.expected create mode 100644 python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.qlref create mode 100644 python/ql/test/3/query-tests/Expressions/general/test.py diff --git a/python/ql/src/Expressions/HashedButNoHash.ql b/python/ql/src/Expressions/HashedButNoHash.ql index 93edc7c9c6cc..6f4dab12570c 100644 --- a/python/ql/src/Expressions/HashedButNoHash.ql +++ b/python/ql/src/Expressions/HashedButNoHash.ql @@ -23,17 +23,57 @@ predicate setsHashToNone(Class cls) { DuckTyping::getAnAttributeValue(cls, "__hash__") instanceof None } +/** + * Holds if `cls` has a `@dataclass` decorator with `frozen=True` or `unsafe_hash=True`, + * which generates a `__hash__` method at runtime. + */ +predicate hasDataclassHash(Class cls) { + exists(DataFlow::CallCfgNode decorator | + decorator = API::moduleImport("dataclasses").getMember("dataclass").getACall() and + decorator.asExpr() = cls.getADecorator() and + decorator.getArgByName(["frozen", "unsafe_hash"]).asExpr().(ImmutableLiteral).booleanValue() = + true + ) +} + +/** + * Holds if `cls` defines `__eq__` directly (not inherited from `object`), + * or has a `@dataclass` decorator that generates `__eq__` (the default). + */ +predicate definesEq(Class cls) { + DuckTyping::hasMethod(cls, "__eq__") + or + // @dataclass(...) call generates __eq__ unless eq=False + exists(DataFlow::CallCfgNode decorator | + decorator = API::moduleImport("dataclasses").getMember("dataclass").getACall() and + decorator.asExpr() = cls.getADecorator() and + not decorator.getArgByName("eq").asExpr().(ImmutableLiteral).booleanValue() = false + ) + or + // bare @dataclass without arguments also generates __eq__ + cls.getADecorator() = + API::moduleImport("dataclasses").getMember("dataclass").getAValueReachableFromSource().asExpr() +} + /** * Holds if `cls` is a user-defined class whose instances are unhashable. - * A new-style class without `__hash__` is unhashable, as is one that explicitly - * sets `__hash__ = None`. + * + * In Python, a class is unhashable if: + * - It explicitly sets `__hash__ = None`, or + * - It defines `__eq__` (directly or via decorator like `@dataclass`) without + * defining `__hash__`, and doesn't have a decorator that generates `__hash__`. */ predicate isUnhashableUserClass(Class cls) { + setsHashToNone(cls) + or + // In Python 3, defining __eq__ without __hash__ makes a class unhashable. + // This rule does not apply in Python 2. + major_version() = 3 and DuckTyping::isNewStyle(cls) and + definesEq(cls) and not DuckTyping::hasMethod(cls, "__hash__") and + not hasDataclassHash(cls) and not DuckTyping::hasUnresolvedBase(getADirectSuperclass*(cls)) - or - setsHashToNone(cls) } /** diff --git a/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.expected b/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.expected new file mode 100644 index 000000000000..c9354099a12c --- /dev/null +++ b/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.expected @@ -0,0 +1,2 @@ +| test.py:17:17:17:19 | ControlFlowNode for obj | This $@ of $@ is unhashable. | test.py:16:11:16:23 | ControlFlowNode for HasEqNoHash() | instance | test.py:16:11:16:23 | ControlFlowNode for HasEqNoHash() | HasEqNoHash | +| test.py:48:17:48:17 | ControlFlowNode for p | This $@ of $@ is unhashable. | test.py:47:9:47:27 | ControlFlowNode for MutableDataclass() | instance | test.py:47:9:47:27 | ControlFlowNode for MutableDataclass() | MutableDataclass | diff --git a/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.qlref b/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.qlref new file mode 100644 index 000000000000..5842132bd430 --- /dev/null +++ b/python/ql/test/3/query-tests/Expressions/general/HashedButNoHash.qlref @@ -0,0 +1 @@ +Expressions/HashedButNoHash.ql diff --git a/python/ql/test/3/query-tests/Expressions/general/test.py b/python/ql/test/3/query-tests/Expressions/general/test.py new file mode 100644 index 000000000000..3c900c19f0b7 --- /dev/null +++ b/python/ql/test/3/query-tests/Expressions/general/test.py @@ -0,0 +1,48 @@ +# Classes without __eq__ are hashable (inherit __hash__ from object) +class NoEqClass: + def __init__(self, x): + self.x = x + +def hash_no_eq(): + obj = NoEqClass(1) + return hash(obj) # OK: no __eq__, so __hash__ is inherited from object + +# Classes with __eq__ but no __hash__ are unhashable +class HasEqNoHash: + def __eq__(self, other): + return self.x == other.x + +def hash_eq_no_hash(): + obj = HasEqNoHash() + return hash(obj) # should be flagged + +# @dataclass(frozen=True) generates __hash__, so instances are hashable +from dataclasses import dataclass + +@dataclass(frozen=True) +class FrozenPoint: + x: int + y: int + +def hash_frozen_dataclass(): + p = FrozenPoint(1, 2) + return hash(p) # OK: frozen dataclass has __hash__ + +# @dataclass(unsafe_hash=True) also generates __hash__ +@dataclass(unsafe_hash=True) +class UnsafeHashPoint: + x: int + y: int + +def hash_unsafe_hash_dataclass(): + p = UnsafeHashPoint(1, 2) + return hash(p) # OK: unsafe_hash=True generates __hash__ + +# Plain @dataclass without frozen/unsafe_hash and with __eq__ is unhashable +@dataclass +class MutableDataclass: + x: int + +def hash_mutable_dataclass(): + p = MutableDataclass(1) + return hash(p) # should be flagged: @dataclass generates __eq__ but not __hash__ diff --git a/python/ql/test/query-tests/Expressions/general/expressions_test.py b/python/ql/test/query-tests/Expressions/general/expressions_test.py index 5e07b58e2041..90a3c6357481 100644 --- a/python/ql/test/query-tests/Expressions/general/expressions_test.py +++ b/python/ql/test/query-tests/Expressions/general/expressions_test.py @@ -279,3 +279,12 @@ def local(): def apply(f): pass apply(foo)([1]) + +# NoEqClass is hashable (no __eq__ => inherits __hash__ from object). +class NoEqClass: + def __init__(self, x): + self.x = x + +def hash_no_eq(): + obj = NoEqClass(1) + return hash(obj) # OK: no __eq__, so __hash__ is inherited from object