alterakey/ext.py

## ext.py
# Trueseeing extension for analyzing Beijing Olympic Winter Games 2022 Android app by alterakey
# This file is in public domain.
#
# Place this file as ~/.trueseeing2/ext/__init__.py, and invoke trueseeing on the target.
from __future__ import annotations
from typing import TYPE_CHECKING

import glob
import re
import os.path
import subprocess

from trueseeing.signature.base import Detector
from trueseeing.core.code.model import Op
from trueseeing.core.literalquery import Query
from trueseeing.core.issue import Issue
from trueseeing.core.ui import ui

if TYPE_CHECKING:
  from typing import Iterable, Dict, Any
  from trueseeing.app.shell import Signatures

def patch_signatures(sigs: Signatures) -> None:
  """Register this extension's detectors in ``sigs.content``, keyed by their ``option``."""
  detectors = (
    HTML5ClearHttp,
    AssetInvalidMimeType,
    AssetString,
    NativeMethods,
    NativeArchs,
  )
  for detector in detectors:
    sigs.content[detector.option] = detector # type: ignore[misc]

class HTML5ClearHttp(Detector):
  """Report cleartext ``http://`` and ``ws://`` URLs found in textual assets.

  Every regular file below ``<wd>/assets`` is scanned, except documentation-like
  files (license/readme/changes, ``.md``, ``.rst``) and files that contain a
  NUL byte.
  """
  option = 'my-html5-clear-http'
  description = 'Detects cleartext HTTP/Websocket reference (HTML5)'
  # _cvss1/_summary1 are used for firm findings, _cvss2/_summary2 for tentative ones.
  _cvss1 = 'CVSS:3.0/AV:N/AC:H/PR:N/UI:R/S:U/C:L/I:L/A:L/'
  _cvss2 = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
  _summary1 = 'Cleartext HTTP/WS reference (HTML5)'
  _summary2 = 'Possible cleartext HTTP/WS reference (HTML5)'
  _synopsis = "One or more textual assets are referring external content using cleartext http."
  _detailed_description = None

  def detect(self) -> Iterable[Issue]:
    """Yield one issue per cleartext URL occurrence.

    A finding is 'tentative' when the asset has a png/jpg/jpeg/gif/bin/css
    extension, unless the asset path or the URL itself mentions
    map/place/tile/track/number/phone, in which case it stays 'firm'.
    """
    wd = self._context.wd
    for fn in glob.glob(os.path.join(wd, 'assets', '**'), recursive=True):
      if not os.path.isfile(fn):
        continue
      # Name-based tests use the path relative to the working directory, so
      # that the location of the working directory cannot influence them.
      rfn = os.path.relpath(fn, wd)
      if re.search(r'license|readme|changes|\.(md|rst)$', rfn, re.IGNORECASE):
        continue
      with open(fn, 'rb') as f:
        content = f.read()
      if b'\x00' in content:
        continue
      # Cheap pre-filter before running the regex over the whole file.
      if b'http://' not in content and b'ws://' not in content:
        continue
      low_risk_type = bool(re.search(r'\.(png|jpe?g|gif|bin|css)$', rfn, re.IGNORECASE))
      # fsencode round-trips filenames that are not valid UTF-8.
      rfn_bytes = os.fsencode(rfn)
      for m in re.finditer(rb'(http|ws)://[0-9a-zA-Z!@#$%^&*_+=/?.,;:-]+', content):
        sensitive = any(re.search(rb'map|place|tile|track|number|phone', k) for k in (rfn_bytes, m.group(0)))
        looks_benign = low_risk_type and not sensitive
        yield Issue(
          detector_id=self.option,
          confidence='tentative' if looks_benign else 'firm',
          cvss3_vector=self._cvss2 if looks_benign else self._cvss1,
          summary=self._summary2 if looks_benign else self._summary1,
          info1='{pat} ({rfn})'.format(pat=m.group(0).decode('utf-8'), rfn=rfn),
          synopsis=self._synopsis,
          description=self._detailed_description,
        )

class AssetInvalidMimeType(Detector):
  """Report assets whose file name and detected MIME type disagree.

  An asset is "named as text" when its path matches ``_pat_fn_known_text`` and
  "typed as text" when the output of file(1) matches
  ``_pat_mimetype_known_text``. An issue is raised when exactly one of the two
  holds.
  """
  option = 'my-asset-type'
  description = 'Detects MIME type disparity among assets'
  _cvss = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
  _summary = 'Asset MIME type mismatch'
  _synopsis = "One or more assets are in unexpected formats."
  _detailed_description = None

  _pat_fn_known_text = r'README|LICENSE|CHANGES|\.(md|rst|txt|html|xml|css|js|glsl|h|json|mf|cfg|ini|pub|sig|pem)$'
  _pat_mimetype_known_text = r'^text/|charset=(?!binary)'
  # Number of paths handed to one file(1) invocation, to keep argv bounded.
  _batch_size = 256

  def detect(self) -> Iterable[Issue]:
    """Yield one issue per asset whose name and MIME type disagree."""
    wd = self._context.wd
    files = [fn for fn in glob.glob(os.path.join(wd, 'assets', '**'), recursive=True) if os.path.isfile(fn)]
    for fn, mimetype in self._mime_types(files):
      # Match the name pattern against the relative path so the location of
      # the working directory cannot influence the result.
      rfn = os.path.relpath(fn, wd)
      named_text = bool(re.search(self._pat_fn_known_text, rfn, re.IGNORECASE))
      typed_text = bool(re.search(self._pat_mimetype_known_text, mimetype))
      if named_text == typed_text:
        continue
      yield Issue(
        detector_id=self.option,
        confidence='firm',
        cvss3_vector=self._cvss,
        summary=self._summary,
        info1='{rfn} (expected {expected}, but it looks like {mimetype})'.format(rfn=rfn, expected='text' if named_text else 'binary', mimetype=mimetype),
        synopsis=self._synopsis,
        description=self._detailed_description,
      )

  def _mime_types(self, files: list[str]) -> Iterable[tuple[str, str]]:
    """Yield ``(path, mime description)`` pairs as reported by file(1).

    file(1) is run directly (no shell) in brief mode, which prints one line
    per argument in argument order; paths therefore never need to be parsed
    back out of its output and may contain whitespace.
    """
    for start in range(0, len(files), self._batch_size):
      batch = files[start:start + self._batch_size]
      try:
        # --mime is the long option; the short spelling differs between platforms.
        p = subprocess.run(['file', '--brief', '--mime', '--'] + batch, stdout=subprocess.PIPE)
      except OSError as e:
        ui.warn(f'my_asset_type: cannot run file(1): {e}')
        return
      lines = p.stdout.decode('utf-8', errors='replace').splitlines()
      if len(lines) != len(batch):
        # Output cannot be paired with the inputs reliably; skip this batch.
        ui.warn(f'my_asset_type: unexpected output from file(1) ({len(lines)} lines for {len(batch)} files); batch skipped')
        continue
      for fn, line in zip(batch, lines):
        yield fn, line.strip()

class AssetString(Detector):
  """Report asset lines containing any of a fixed set of strings.

  Each term is searched as raw bytes in every encoding listed in
  ``_encodings`` that can represent it.
  """
  option = 'my-asset-string'
  description = 'Detects interesting string in assets'
  _cvss = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
  _summary = 'String match in asset (HTML5)'
  _synopsis = "The application contains assets with interesting string."
  _detailed_description = "The application contains assets matching the pattern of {pat}, in any of the following encodings: {enc}"

  # Human-readable form of the search, shown in the issue description.
  _pat = r'反共|亲日|日本|无罪|(魚釣|釣魚)島|天安門|打倒|公安|co2'
  # Literal expansion of _pat; keep the two in sync. Terms are encoded one by
  # one so that a term an encoding cannot represent does not disable that
  # encoding for every other term.
  _terms = ('反共', '亲日', '日本', '无罪', '魚釣島', '釣魚島', '天安門', '打倒', '公安', 'co2')
  _encodings = ('UTF-8', 'EUC-CN', 'Big5', 'GB18030')

  def detect(self) -> Iterable[Issue]:
    """Yield one issue per distinct match (line and byte span) in each asset."""
    bpats = self._compile_patterns()
    if not bpats:
      return

    wd = self._context.wd
    files = [fn for fn in glob.glob(os.path.join(wd, 'assets', '**'), recursive=True) if os.path.isfile(fn)]
    for fn in files:
      with open(fn, 'rb') as f:
        lines = f.read().splitlines()
      rfn = os.path.relpath(fn, wd)
      # Identical bytes can match under several encodings (e.g. ASCII terms);
      # report each (line, span) once, under the first encoding that hits.
      seen = set()
      for enc, bpat in bpats.items():
        for lineno, l in enumerate(lines):
          for m in bpat.finditer(l):
            key = (lineno, m.start(), m.end())
            if key in seen:
              continue
            seen.add(key)
            yield Issue(
              detector_id=self.option,
              confidence='firm',
              cvss3_vector=self._cvss,
              summary=self._summary,
              info1='{match} [{enc}] ({rfn})'.format(rfn=rfn, match=l.decode(enc, errors='ignore'), enc=enc),
              synopsis=self._synopsis,
              description=self._detailed_description.format(pat=self._pat, enc=','.join(self._encodings)),
            )

  def _compile_patterns(self) -> Dict[str, re.Pattern[bytes]]:
    """Build one bytes pattern per encoding from the terms it can encode."""
    bpats: Dict[str, re.Pattern[bytes]] = dict()
    for e in self._encodings:
      alts = []
      skipped = 0
      for term in self._terms:
        try:
          raw = term.encode(e)
        except UnicodeEncodeError:
          skipped += 1
          continue
        # Encoded bytes may coincide with regex metacharacters; escape them.
        lit = re.escape(raw)
        # Case-insensitivity is limited to pure-ASCII terms so that it never
        # applies to individual bytes of a multi-byte character.
        alts.append(b'(?i:' + lit + b')' if term.isascii() else lit)
      if not alts:
        ui.warn(f'my_asset_string: encoding {e} cannot encode pattern; skipped')
        continue
      if skipped:
        ui.warn(f'my_asset_string: encoding {e} cannot encode {skipped} term(s); searching for the rest')
      bpats[e] = re.compile(b'|'.join(alts))
    return bpats

class NativeMethods(Detector):
  """Report methods for which the code database holds a ``native`` token."""
  option = 'my-native-methods'
  description = 'Detects natively defined methods'
  _cvss = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
  _summary = 'Natively defined methods'
  _synopsis = "The application uses JNI."
  _detailed_description = None

  def detect(self) -> Iterable[Issue]:
    """Yield one firm issue per matching method, located by its qualified name."""
    with self._context.store() as store:
      for native_op in self._nativeish_methods(store.db):
        location = store.query().qualname_of(native_op)
        yield Issue(
          detector_id=self.option,
          confidence='firm',
          cvss3_vector=self._cvss,
          summary=self._summary,
          synopsis=self._synopsis,
          source=location,
        )

  def _nativeish_methods(self, c: Any) -> Iterable[Op]:
    """Yield an ``Op`` for each row whose token value equals ``'native'``.

    ``c`` only needs an ``execute(sql, params)`` method returning rows.
    """
    # NOTE(review): the WHERE clause tests v and v2..v9 but not v1 -- confirm this is intended.
    rows = c.execute('select op_vecs.op as _0, t as _1, v as _2, op1 as _3, t1 as _4, v1 as _5, op2 as _6, t2 as _7, v2 as _8, op3 as _9, t3 as _10, v3 as _11, op4 as _12, t4 as _13, v4 as _14, op5 as _15, t5 as _16, v5 as _17, op6 as _18, t6 as _19, v6 as _20, op7 as _21, t7 as _22, v7 as _23, op8 as _24, t8 as _25, v8 as _26, op9 as _27, t9 as _28, v9 as _29 from ops_method join op_vecs on (method=ops_method.op and method=op_vecs.op) where v=:pat or v2=:pat or v3=:pat or v4=:pat or v5=:pat or v6=:pat or v7=:pat or v8=:pat or v9=:pat', dict(pat='native'))
    yield from map(Query._op_from_row, rows)

class NativeArchs(Detector):
  """Report each architecture directory found directly under ``<wd>/lib``."""
  option = 'my-native-archs'
  description = 'Detects supported architectures'
  _cvss = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
  _summary = 'Supported architectures'
  _synopsis = "The application has native codes for some architectures."

  def detect(self) -> Iterable[Issue]:
    """Yield one issue per ``lib/*`` directory whose name mentions arm, x86 or mips."""
    for d in glob.glob(os.path.join(self._context.wd, 'lib', '*')):
      if not os.path.isdir(d):
        continue
      # Test the directory name only: matching the full path would let the
      # working directory's own name (e.g. one containing "arm") decide.
      arch = os.path.basename(d)
      if re.search(r'arm|x86|mips', arch):
        yield Issue(
          detector_id=self.option,
          confidence='firm',
          cvss3_vector=self._cvss,
          summary=self._summary,
          info1=arch,
          synopsis=self._synopsis,
        )
	# Trueseeing extension for analyzing Beijing Olympic Winter Games 2022 Android app by alterakey
	# This file is in public domain.
	#
	# Place this file as ~/.trueseeing2/ext/__init__.py, and invoke trueseeing on the target.
	from __future__ import annotations
	from typing import TYPE_CHECKING

	import glob
	import re
	import os.path
	import subprocess

	from trueseeing.signature.base import Detector
	from trueseeing.core.code.model import Op
	from trueseeing.core.literalquery import Query
	from trueseeing.core.issue import Issue
	from trueseeing.core.ui import ui

	if TYPE_CHECKING:
	from typing import Iterable, Dict, Any
	from trueseeing.app.shell import Signatures

	def patch_signatures(sigs: Signatures) -> None:
	  """Register this extension's detectors in ``sigs.content``, keyed by their ``option``."""
	  detectors = (
	    HTML5ClearHttp,
	    AssetInvalidMimeType,
	    AssetString,
	    NativeMethods,
	    NativeArchs,
	  )
	  for detector in detectors:
	    sigs.content[detector.option] = detector # type: ignore[misc]

	class HTML5ClearHttp(Detector):
	  """Report cleartext ``http://`` and ``ws://`` URLs found in textual assets.

	  Every regular file below ``<wd>/assets`` is scanned, except documentation-like
	  files (license/readme/changes, ``.md``, ``.rst``) and files that contain a
	  NUL byte.
	  """
	  option = 'my-html5-clear-http'
	  description = 'Detects cleartext HTTP/Websocket reference (HTML5)'
	  # _cvss1/_summary1 are used for firm findings, _cvss2/_summary2 for tentative ones.
	  _cvss1 = 'CVSS:3.0/AV:N/AC:H/PR:N/UI:R/S:U/C:L/I:L/A:L/'
	  _cvss2 = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
	  _summary1 = 'Cleartext HTTP/WS reference (HTML5)'
	  _summary2 = 'Possible cleartext HTTP/WS reference (HTML5)'
	  _synopsis = "One or more textual assets are referring external content using cleartext http."
	  _detailed_description = None

	  def detect(self) -> Iterable[Issue]:
	    """Yield one issue per cleartext URL occurrence.

	    A finding is 'tentative' when the asset has a png/jpg/jpeg/gif/bin/css
	    extension, unless the asset path or the URL itself mentions
	    map/place/tile/track/number/phone, in which case it stays 'firm'.
	    """
	    wd = self._context.wd
	    for fn in glob.glob(os.path.join(wd, 'assets', '**'), recursive=True):
	      if not os.path.isfile(fn):
	        continue
	      # Name-based tests use the path relative to the working directory, so
	      # that the location of the working directory cannot influence them.
	      rfn = os.path.relpath(fn, wd)
	      if re.search(r'license|readme|changes|\.(md|rst)$', rfn, re.IGNORECASE):
	        continue
	      with open(fn, 'rb') as f:
	        content = f.read()
	      if b'\x00' in content:
	        continue
	      # Cheap pre-filter before running the regex over the whole file.
	      if b'http://' not in content and b'ws://' not in content:
	        continue
	      low_risk_type = bool(re.search(r'\.(png|jpe?g|gif|bin|css)$', rfn, re.IGNORECASE))
	      # fsencode round-trips filenames that are not valid UTF-8.
	      rfn_bytes = os.fsencode(rfn)
	      for m in re.finditer(rb'(http|ws)://[0-9a-zA-Z!@#$%^&*_+=/?.,;:-]+', content):
	        sensitive = any(re.search(rb'map|place|tile|track|number|phone', k) for k in (rfn_bytes, m.group(0)))
	        looks_benign = low_risk_type and not sensitive
	        yield Issue(
	          detector_id=self.option,
	          confidence='tentative' if looks_benign else 'firm',
	          cvss3_vector=self._cvss2 if looks_benign else self._cvss1,
	          summary=self._summary2 if looks_benign else self._summary1,
	          info1='{pat} ({rfn})'.format(pat=m.group(0).decode('utf-8'), rfn=rfn),
	          synopsis=self._synopsis,
	          description=self._detailed_description,
	        )

	class AssetInvalidMimeType(Detector):
	  """Report assets whose file name and detected MIME type disagree.

	  An asset is "named as text" when its path matches ``_pat_fn_known_text`` and
	  "typed as text" when the output of file(1) matches
	  ``_pat_mimetype_known_text``. An issue is raised when exactly one of the two
	  holds.
	  """
	  option = 'my-asset-type'
	  description = 'Detects MIME type disparity among assets'
	  _cvss = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
	  _summary = 'Asset MIME type mismatch'
	  _synopsis = "One or more assets are in unexpected formats."
	  _detailed_description = None

	  _pat_fn_known_text = r'README|LICENSE|CHANGES|\.(md|rst|txt|html|xml|css|js|glsl|h|json|mf|cfg|ini|pub|sig|pem)$'
	  _pat_mimetype_known_text = r'^text/|charset=(?!binary)'
	  # Number of paths handed to one file(1) invocation, to keep argv bounded.
	  _batch_size = 256

	  def detect(self) -> Iterable[Issue]:
	    """Yield one issue per asset whose name and MIME type disagree."""
	    wd = self._context.wd
	    files = [fn for fn in glob.glob(os.path.join(wd, 'assets', '**'), recursive=True) if os.path.isfile(fn)]
	    for fn, mimetype in self._mime_types(files):
	      # Match the name pattern against the relative path so the location of
	      # the working directory cannot influence the result.
	      rfn = os.path.relpath(fn, wd)
	      named_text = bool(re.search(self._pat_fn_known_text, rfn, re.IGNORECASE))
	      typed_text = bool(re.search(self._pat_mimetype_known_text, mimetype))
	      if named_text == typed_text:
	        continue
	      yield Issue(
	        detector_id=self.option,
	        confidence='firm',
	        cvss3_vector=self._cvss,
	        summary=self._summary,
	        info1='{rfn} (expected {expected}, but it looks like {mimetype})'.format(rfn=rfn, expected='text' if named_text else 'binary', mimetype=mimetype),
	        synopsis=self._synopsis,
	        description=self._detailed_description,
	      )

	  def _mime_types(self, files: list[str]) -> Iterable[tuple[str, str]]:
	    """Yield ``(path, mime description)`` pairs as reported by file(1).

	    file(1) is run directly (no shell) in brief mode, which prints one line
	    per argument in argument order; paths therefore never need to be parsed
	    back out of its output and may contain whitespace.
	    """
	    for start in range(0, len(files), self._batch_size):
	      batch = files[start:start + self._batch_size]
	      try:
	        # --mime is the long option; the short spelling differs between platforms.
	        p = subprocess.run(['file', '--brief', '--mime', '--'] + batch, stdout=subprocess.PIPE)
	      except OSError as e:
	        ui.warn(f'my_asset_type: cannot run file(1): {e}')
	        return
	      lines = p.stdout.decode('utf-8', errors='replace').splitlines()
	      if len(lines) != len(batch):
	        # Output cannot be paired with the inputs reliably; skip this batch.
	        ui.warn(f'my_asset_type: unexpected output from file(1) ({len(lines)} lines for {len(batch)} files); batch skipped')
	        continue
	      for fn, line in zip(batch, lines):
	        yield fn, line.strip()

	class AssetString(Detector):
	  """Report asset lines containing any of a fixed set of strings.

	  Each term is searched as raw bytes in every encoding listed in
	  ``_encodings`` that can represent it.
	  """
	  option = 'my-asset-string'
	  description = 'Detects interesting string in assets'
	  _cvss = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
	  _summary = 'String match in asset (HTML5)'
	  _synopsis = "The application contains assets with interesting string."
	  _detailed_description = "The application contains assets matching the pattern of {pat}, in any of the following encodings: {enc}"

	  # Human-readable form of the search, shown in the issue description.
	  _pat = r'反共|亲日|日本|无罪|(魚釣|釣魚)島|天安門|打倒|公安|co2'
	  # Literal expansion of _pat; keep the two in sync. Terms are encoded one by
	  # one so that a term an encoding cannot represent does not disable that
	  # encoding for every other term.
	  _terms = ('反共', '亲日', '日本', '无罪', '魚釣島', '釣魚島', '天安門', '打倒', '公安', 'co2')
	  _encodings = ('UTF-8', 'EUC-CN', 'Big5', 'GB18030')

	  def detect(self) -> Iterable[Issue]:
	    """Yield one issue per distinct match (line and byte span) in each asset."""
	    bpats = self._compile_patterns()
	    if not bpats:
	      return

	    wd = self._context.wd
	    files = [fn for fn in glob.glob(os.path.join(wd, 'assets', '**'), recursive=True) if os.path.isfile(fn)]
	    for fn in files:
	      with open(fn, 'rb') as f:
	        lines = f.read().splitlines()
	      rfn = os.path.relpath(fn, wd)
	      # Identical bytes can match under several encodings (e.g. ASCII terms);
	      # report each (line, span) once, under the first encoding that hits.
	      seen = set()
	      for enc, bpat in bpats.items():
	        for lineno, l in enumerate(lines):
	          for m in bpat.finditer(l):
	            key = (lineno, m.start(), m.end())
	            if key in seen:
	              continue
	            seen.add(key)
	            yield Issue(
	              detector_id=self.option,
	              confidence='firm',
	              cvss3_vector=self._cvss,
	              summary=self._summary,
	              info1='{match} [{enc}] ({rfn})'.format(rfn=rfn, match=l.decode(enc, errors='ignore'), enc=enc),
	              synopsis=self._synopsis,
	              description=self._detailed_description.format(pat=self._pat, enc=','.join(self._encodings)),
	            )

	  def _compile_patterns(self) -> Dict[str, re.Pattern[bytes]]:
	    """Build one bytes pattern per encoding from the terms it can encode."""
	    bpats: Dict[str, re.Pattern[bytes]] = dict()
	    for e in self._encodings:
	      alts = []
	      skipped = 0
	      for term in self._terms:
	        try:
	          raw = term.encode(e)
	        except UnicodeEncodeError:
	          skipped += 1
	          continue
	        # Encoded bytes may coincide with regex metacharacters; escape them.
	        lit = re.escape(raw)
	        # Case-insensitivity is limited to pure-ASCII terms so that it never
	        # applies to individual bytes of a multi-byte character.
	        alts.append(b'(?i:' + lit + b')' if term.isascii() else lit)
	      if not alts:
	        ui.warn(f'my_asset_string: encoding {e} cannot encode pattern; skipped')
	        continue
	      if skipped:
	        ui.warn(f'my_asset_string: encoding {e} cannot encode {skipped} term(s); searching for the rest')
	      bpats[e] = re.compile(b'|'.join(alts))
	    return bpats

	class NativeMethods(Detector):
	  """Report methods for which the code database holds a ``native`` token."""
	  option = 'my-native-methods'
	  description = 'Detects natively defined methods'
	  _cvss = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
	  _summary = 'Natively defined methods'
	  _synopsis = "The application uses JNI."
	  _detailed_description = None

	  def detect(self) -> Iterable[Issue]:
	    """Yield one firm issue per matching method, located by its qualified name."""
	    with self._context.store() as store:
	      for native_op in self._nativeish_methods(store.db):
	        location = store.query().qualname_of(native_op)
	        yield Issue(
	          detector_id=self.option,
	          confidence='firm',
	          cvss3_vector=self._cvss,
	          summary=self._summary,
	          synopsis=self._synopsis,
	          source=location,
	        )

	  def _nativeish_methods(self, c: Any) -> Iterable[Op]:
	    """Yield an ``Op`` for each row whose token value equals ``'native'``.

	    ``c`` only needs an ``execute(sql, params)`` method returning rows.
	    """
	    # NOTE(review): the WHERE clause tests v and v2..v9 but not v1 -- confirm this is intended.
	    rows = c.execute('select op_vecs.op as _0, t as _1, v as _2, op1 as _3, t1 as _4, v1 as _5, op2 as _6, t2 as _7, v2 as _8, op3 as _9, t3 as _10, v3 as _11, op4 as _12, t4 as _13, v4 as _14, op5 as _15, t5 as _16, v5 as _17, op6 as _18, t6 as _19, v6 as _20, op7 as _21, t7 as _22, v7 as _23, op8 as _24, t8 as _25, v8 as _26, op9 as _27, t9 as _28, v9 as _29 from ops_method join op_vecs on (method=ops_method.op and method=op_vecs.op) where v=:pat or v2=:pat or v3=:pat or v4=:pat or v5=:pat or v6=:pat or v7=:pat or v8=:pat or v9=:pat', dict(pat='native'))
	    yield from map(Query._op_from_row, rows)

	class NativeArchs(Detector):
	  """Report each architecture directory found directly under ``<wd>/lib``."""
	  option = 'my-native-archs'
	  description = 'Detects supported architectures'
	  _cvss = 'CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:N/'
	  _summary = 'Supported architectures'
	  _synopsis = "The application has native codes for some architectures."

	  def detect(self) -> Iterable[Issue]:
	    """Yield one issue per ``lib/*`` directory whose name mentions arm, x86 or mips."""
	    for d in glob.glob(os.path.join(self._context.wd, 'lib', '*')):
	      if not os.path.isdir(d):
	        continue
	      # Test the directory name only: matching the full path would let the
	      # working directory's own name (e.g. one containing "arm") decide.
	      arch = os.path.basename(d)
	      if re.search(r'arm|x86|mips', arch):
	        yield Issue(
	          detector_id=self.option,
	          confidence='firm',
	          cvss3_vector=self._cvss,
	          summary=self._summary,
	          info1=arch,
	          synopsis=self._synopsis,
	        )