cypres/smslen.cc

## smslen.cc
#include <cassert>    // cassert
#include <cmath>      // ceil
#include <iostream>   // cout
#include <cstring>    // memset, memcpy

// Given a NUL-terminated UTF-8 encoded string, calculate the length of the
// resulting GSM 03.38 converted string.
//
// Counting rules:
//  - A plain ASCII byte counts as one GSM character.
//  - The ASCII characters ^ { } [ ] ~ \ | are in the GSM 03.38 extension
//    table; they are sent escaped and therefore count as two.
//  - The euro sign (bytes E2 82 AC) is the only non-ASCII character that is
//    escaped, so it counts as two as well.
//  - Every other multi-byte UTF-8 character counts as one; characters that
//    can not be represented will be replaced with a single ? char.
//
// The input is expected to be valid UTF-8. On malformed or truncated input
// the count is best-effort, but the scan never steps over the terminator.
// Highly optimized code, runs best with clang and optimizations (-O) enabled.
size_t SmsLength(const char *str) {
  size_t l = 0;
  for (; ; ++l) {
    // Work on unsigned bytes so the comparisons below do not depend on
    // whether plain char is signed on the target platform.
    const unsigned char c = static_cast<unsigned char>(*str);
    if (c == '\0') break;

    if (__builtin_expect(c < 0x80, 1)) {
      // This is ASCII
      ++str;
      switch (c) {  // Is this a char from the GSM 03.38 extension table?
        default: break;
        case '^': case '{': case '}': case '~':
        case '[': case ']': case '\\': case '|':
          ++l;  // Add an extra since these chars will be escaped
          break;
      }
    } else {
      // The only UTF-8 char that needs to be escaped for GSM 03.38 is €
      // All others that are not supported will be replaced with a single ?
      // The && chain short-circuits, so str[2] is only read when str[1]
      // matched and is therefore not the terminator.
      if (__builtin_expect(c == 0xE2, 0) &&
          static_cast<unsigned char>(str[1]) == 0x82 &&
          static_cast<unsigned char>(str[2]) == 0xAC) {
        ++l;
      }
      // Skip the lead byte plus any continuation bytes (10xxxxxx). Unlike a
      // fixed jump derived from the lead byte, this can not run past the
      // NUL terminator when the final sequence is truncated.
      ++str;
      while ((static_cast<unsigned char>(*str) & 0xC0) == 0x80) ++str;
    }
  }
  return l;
}

// Get the number of SMSes a UTF-8 string will be split into.
// Assumes
//  - The string will be sent using GSM 03.38 encoding
//  - The SMSC will use 8-bit reference numbers, leaving 153 chars per SMS
// If the message contains unicode chars outside GSM 03.38 it should be sent
// as UCS-2, with lower amounts of chars for each split; see the inline
// comments on the constants below.
// Returns 1 when the message fits in a single SMS, otherwise the number of
// parts needed at 153 chars per part.
int SmsSplits(const char *str) {
  const size_t chars_per_sms = 153;  // 152 for 16-bit reference numbers, 66 for UCS-2
  const size_t chars_single_sms = 160;  // 70 for UCS-2
  const size_t length = SmsLength(str);
  if (length <= chars_single_sms) {
    return 1;
  }
  // Integer ceiling division: exact for every size_t value and avoids the
  // round trip through double plus the implicit double -> int narrowing.
  const size_t parts = length / chars_per_sms +
                       (length % chars_per_sms != 0 ? 1 : 0);
  return static_cast<int>(parts);
}


// Self-test and micro-benchmark for SmsLength() and SmsSplits().
// Compile with: `clang++ -Ofast -g smslen.cc -o smslen`
// Run `time ./smslen`
int main() {
  const char t[] = "Hej v€rden. Hvordan har du det i dag? "
    "Jeg glæder mig til at se dig.";

  // Benchmark length calculation
  for (int i = 0; i < 10000000; i++) {
    assert(SmsLength(t) == 68);
  }

  // Try some unicode
  assert(SmsLength("~€") == 4);
  assert(SmsLength("こんにちは") == 5);  // would be sent as ????? with GSM

  // Various test cases for SmsSplits()
  const char s160[] = "Lorem ipsum dolor sit amet, consectetur adipiscing "
    "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. Nam ornare "
    "augue vel nisi tempus, id cras amet.";
  int split160 = SmsSplits(s160);
  std::cout << "SMS with 160 bytes of lipsum: " << split160 << std::endl;
  assert(split160 == 1);

  // Copy of s160 with its last character replaced by '~'. The buffer is
  // sized from s160 so the terminating NUL is copied too; without it
  // SmsSplits() would read past the end of the array.
  char s161[sizeof(s160)];
  memcpy(s161, s160, sizeof(s160));
  s161[sizeof(s160) - 2] = '~';  // ~ is two GSM 03.38 chars, so the result will be 161
  int split161 = SmsSplits(s161);
  std::cout << "SMS with 161 bytes of lipsum: " << split161 << std::endl;
  assert(split161 == 2);

  // 305 plain chars; the zero-initialised 306th byte is the terminator.
  char s305[306] = { 0 };
  memset(s305, 'a', 305);
  int split305 = SmsSplits(s305);
  std::cout << "SMS with 305 'a's: " << split305 << std::endl;
  assert(split305 == 2);


  return 0;
}

## smslen.php
<?php

/**
 * Count how many GSM 03.38 characters a UTF-8 string occupies.
 *
 * Every character counts once; the characters matched by the regex below
 * (^ { } \ ~ € | [ ]) are counted one extra time.
 *
 * @param string $utf8String UTF-8 encoded message text
 * @return int character count plus the number of regex matches
 */
function countGsm0338Length($utf8String)
{
  $charCount = mb_strlen($utf8String, 'utf-8');
  $escapedCount = preg_match_all('/[\\^{}\\\~€|\\[\\]]/mu', $utf8String, $matches);
  return $charCount + $escapedCount;
}

/**
 * Number of SMS parts needed to send $message.
 *
 * Uses the length reported by countGsm0338Length(): up to 160 fits in one
 * SMS, anything longer is divided into parts of 153.
 *
 * @param string $message UTF-8 encoded message text
 * @return int|float 1 for a single SMS, otherwise the result of ceil()
 */
function countSmsSplits($message)
{
  $charsSingleSms = 160;  // 70 for UCS-2
  $charsPerSms = 153;  // 152 for 16-bit reference numbers, 66 for UCS-2
  $gsmLength = countGsm0338Length($message);
  return ($gsmLength <= $charsSingleSms) ? 1 : ceil($gsmLength / $charsPerSms);
}

// Sample sentence: 67 characters, one of which (€) is counted twice -> 68.
$t = "Hej v€rden. Hvordan har du det i dag? Jeg glæder mig til at se dig.";
assert(countGsm0338Length($t) == 68);
assert(countSmsSplits($t) == 1);


// 160 plain characters: the largest message that still fits in one SMS.
$s160 = "Lorem ipsum dolor sit amet, consectetur adipiscing ";
$s160 .= "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. ";
$s160 .= "Nam ornare augue vel nisi tempus, id cras amet.";
assert(countSmsSplits($s160) == 1);

// Offset 160 is one past the last character, so this is expected to append
// a '~' (counted twice) and push the message over the single-SMS limit.
$s160[160] = '~';
assert(countSmsSplits($s160) == 2);

// 61 * 5 = 305 plain characters, which needs two parts of 153.
$s305 = str_repeat('Lorem', 61);
assert(countSmsSplits($s305) == 2);

## smslen.py
# coding=utf8
from __future__ import division
import re
from math import ceil

def smslen(message):
    """Return the length of ``message`` when encoded as GSM 03.38.

    Every character counts once; the characters ``^ { } \\ ~ € | [ ]`` are
    escaped in GSM 03.38 and are therefore counted one extra time.

    :param message: unicode text (``unicode`` on Python 2, ``str`` on Python 3)
    :returns: number of characters plus the number of escaped characters
    :raises Exception: if ``message`` is not a unicode text string
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is the unicode text type
    if not isinstance(message, text_type):
        raise Exception('Need unicode aware strings to get sms len')
    # The class lists ^ { } \ ~ € | [ ]. The backslash is part of the set so
    # that it is counted twice, like the other escaped characters. \u20ac is
    # the euro sign, written as an escape so the pattern does not depend on
    # the source file encoding.
    escaped = re.findall(u'[\\^{}\\\\~\u20ac|\\[\\]]', message,
                         flags=re.M | re.U)
    return len(message) + len(escaped)

def smssplit(message):
    """Return how many SMS parts ``message`` is split into.

    The length comes from ``smslen``. Up to 160 fits in a single SMS; longer
    messages are divided into parts of 153 and the result of ``ceil`` is
    returned.
    """
    single_sms_limit = 160  # 70 for UCS-2
    part_size = 153  # 152 for 16-bit reference numbers, 66 for UCS-2
    gsm_length = smslen(message)
    if gsm_length > single_sms_limit:
        return ceil(gsm_length / part_size)
    return 1

# Sample sentence: 67 characters, one of which (€) is counted twice -> 68.
t = unicode("Hej v€rden. Hvordan har du det i dag? "
            "Jeg glæder mig til at se dig.", 'utf8')
assert(smslen(t) == 68)
# None of these characters is in the escaped set, so each counts once.
assert(smslen(u"こんにちは") == 5)

# 160 plain characters: the largest message that still fits in one SMS.
s160 = (u"Lorem ipsum dolor sit amet, consectetur adipiscing "
"elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. Nam ornare "
"augue vel nisi tempus, id cras amet.")

assert(smssplit(s160) == 1)

# Replace the last character with '~' (counted twice): 159 + 2 = 161.
s161 = s160[:159] + '~'
assert(smssplit(s161) == 2)

# 61 * 5 = 305 plain characters, which needs two parts of 153.
s305 = u"Lorem" * 61
assert(smssplit(s305) == 2)

## smssplits.txt
function CountSmsSplits(string message):
  length := number of UTF-8 decoded characters in message;  // unicode aware, e.g. mb_strlen
  // Add the number of times these special chars occur in message: '^{}~€[]\|', e.g. counted with a regex.
  length := length + count of regular expression matches("/[\\^{}\\\~€|\\[\\]]/mu", message)
  // Calculate number of splits
  charsPerMessageInChain := 153  // 152 for 16-bit reference numbers, 66 for UCS-2
  charsSingleMessage     := 160  // 70 for UCS-2
  if length <= charsSingleMessage:
    return 1
  else:
    return ceiling(length / charsPerMessageInChain)
  end if
end function
	#include <cassert> // cassert
	#include <cmath> // ceil
	#include <iostream> // cout
	#include <cstring> // memset, memcpy

	// Given a NUL-terminated UTF-8 encoded string, calculate the length of the
	// resulting GSM 03.38 converted string.
	//
	// Counting rules:
	//  - A plain ASCII byte counts as one GSM character.
	//  - The ASCII characters ^ { } [ ] ~ \ | are in the GSM 03.38 extension
	//    table; they are sent escaped and therefore count as two.
	//  - The euro sign (bytes E2 82 AC) is the only non-ASCII character that is
	//    escaped, so it counts as two as well.
	//  - Every other multi-byte UTF-8 character counts as one; characters that
	//    can not be represented will be replaced with a single ? char.
	//
	// The input is expected to be valid UTF-8. On malformed or truncated input
	// the count is best-effort, but the scan never steps over the terminator.
	// Highly optimized code, runs best with clang and optimizations (-O) enabled.
	size_t SmsLength(const char *str) {
	  size_t l = 0;
	  for (; ; ++l) {
	    // Work on unsigned bytes so the comparisons below do not depend on
	    // whether plain char is signed on the target platform.
	    const unsigned char c = static_cast<unsigned char>(*str);
	    if (c == '\0') break;

	    if (__builtin_expect(c < 0x80, 1)) {
	      // This is ASCII
	      ++str;
	      switch (c) {  // Is this a char from the GSM 03.38 extension table?
	        default: break;
	        case '^': case '{': case '}': case '~':
	        case '[': case ']': case '\\': case '|':
	          ++l;  // Add an extra since these chars will be escaped
	          break;
	      }
	    } else {
	      // The only UTF-8 char that needs to be escaped for GSM 03.38 is €
	      // All others that are not supported will be replaced with a single ?
	      // The && chain short-circuits, so str[2] is only read when str[1]
	      // matched and is therefore not the terminator.
	      if (__builtin_expect(c == 0xE2, 0) &&
	          static_cast<unsigned char>(str[1]) == 0x82 &&
	          static_cast<unsigned char>(str[2]) == 0xAC) {
	        ++l;
	      }
	      // Skip the lead byte plus any continuation bytes (10xxxxxx). Unlike a
	      // fixed jump derived from the lead byte, this can not run past the
	      // NUL terminator when the final sequence is truncated.
	      ++str;
	      while ((static_cast<unsigned char>(*str) & 0xC0) == 0x80) ++str;
	    }
	  }
	  return l;
	}

	// Get the number of SMSes a UTF-8 string will be split into.
	// Assumes
	//  - The string will be sent using GSM 03.38 encoding
	//  - The SMSC will use 8-bit reference numbers, leaving 153 chars per SMS
	// If the message contains unicode chars outside GSM 03.38 it should be sent
	// as UCS-2, with lower amounts of chars for each split; see the inline
	// comments on the constants below.
	// Returns 1 when the message fits in a single SMS, otherwise the number of
	// parts needed at 153 chars per part.
	int SmsSplits(const char *str) {
	  const size_t chars_per_sms = 153;  // 152 for 16-bit reference numbers, 66 for UCS-2
	  const size_t chars_single_sms = 160;  // 70 for UCS-2
	  const size_t length = SmsLength(str);
	  if (length <= chars_single_sms) {
	    return 1;
	  }
	  // Integer ceiling division: exact for every size_t value and avoids the
	  // round trip through double plus the implicit double -> int narrowing.
	  const size_t parts = length / chars_per_sms +
	                       (length % chars_per_sms != 0 ? 1 : 0);
	  return static_cast<int>(parts);
	}


	// Self-test and micro-benchmark for SmsLength() and SmsSplits().
	// Compile with: `clang++ -Ofast -g smslen.cc -o smslen`
	// Run `time ./smslen`
	int main() {
	  const char t[] = "Hej v€rden. Hvordan har du det i dag? "
	    "Jeg glæder mig til at se dig.";

	  // Benchmark length calculation
	  for (int i = 0; i < 10000000; i++) {
	    assert(SmsLength(t) == 68);
	  }

	  // Try some unicode
	  assert(SmsLength("~€") == 4);
	  assert(SmsLength("こんにちは") == 5);  // would be sent as ????? with GSM

	  // Various test cases for SmsSplits()
	  const char s160[] = "Lorem ipsum dolor sit amet, consectetur adipiscing "
	    "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. Nam ornare "
	    "augue vel nisi tempus, id cras amet.";
	  int split160 = SmsSplits(s160);
	  std::cout << "SMS with 160 bytes of lipsum: " << split160 << std::endl;
	  assert(split160 == 1);

	  // Copy of s160 with its last character replaced by '~'. The buffer is
	  // sized from s160 so the terminating NUL is copied too; without it
	  // SmsSplits() would read past the end of the array.
	  char s161[sizeof(s160)];
	  memcpy(s161, s160, sizeof(s160));
	  s161[sizeof(s160) - 2] = '~';  // ~ is two GSM 03.38 chars, so the result will be 161
	  int split161 = SmsSplits(s161);
	  std::cout << "SMS with 161 bytes of lipsum: " << split161 << std::endl;
	  assert(split161 == 2);

	  // 305 plain chars; the zero-initialised 306th byte is the terminator.
	  char s305[306] = { 0 };
	  memset(s305, 'a', 305);
	  int split305 = SmsSplits(s305);
	  std::cout << "SMS with 305 'a's: " << split305 << std::endl;
	  assert(split305 == 2);


	  return 0;
	}
	<?php

	/**
	 * Count how many GSM 03.38 characters a UTF-8 string occupies.
	 *
	 * Every character counts once; the characters matched by the regex below
	 * (^ { } \ ~ € | [ ]) are counted one extra time.
	 *
	 * @param string $utf8String UTF-8 encoded message text
	 * @return int character count plus the number of regex matches
	 */
	function countGsm0338Length($utf8String)
	{
	  $charCount = mb_strlen($utf8String, 'utf-8');
	  $escapedCount = preg_match_all('/[\\^{}\\\~€\|\\[\\]]/mu', $utf8String, $matches);
	  return $charCount + $escapedCount;
	}

	/**
	 * Number of SMS parts needed to send $message.
	 *
	 * Uses the length reported by countGsm0338Length(): up to 160 fits in one
	 * SMS, anything longer is divided into parts of 153.
	 *
	 * @param string $message UTF-8 encoded message text
	 * @return int|float 1 for a single SMS, otherwise the result of ceil()
	 */
	function countSmsSplits($message)
	{
	  $charsSingleSms = 160;  // 70 for UCS-2
	  $charsPerSms = 153;  // 152 for 16-bit reference numbers, 66 for UCS-2
	  $gsmLength = countGsm0338Length($message);
	  return ($gsmLength <= $charsSingleSms) ? 1 : ceil($gsmLength / $charsPerSms);
	}

	// Sample sentence: 67 characters, one of which (€) is counted twice -> 68.
	$t = "Hej v€rden. Hvordan har du det i dag? Jeg glæder mig til at se dig.";
	assert(countGsm0338Length($t) == 68);
	assert(countSmsSplits($t) == 1);


	// 160 plain characters: the largest message that still fits in one SMS.
	$s160 = "Lorem ipsum dolor sit amet, consectetur adipiscing ";
	$s160 .= "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. ";
	$s160 .= "Nam ornare augue vel nisi tempus, id cras amet.";
	assert(countSmsSplits($s160) == 1);

	// Offset 160 is one past the last character, so this is expected to append
	// a '~' (counted twice) and push the message over the single-SMS limit.
	$s160[160] = '~';
	assert(countSmsSplits($s160) == 2);

	// 61 * 5 = 305 plain characters, which needs two parts of 153.
	$s305 = str_repeat('Lorem', 61);
	assert(countSmsSplits($s305) == 2);
	# coding=utf8
	from __future__ import division
	import re
	from math import ceil

	def smslen(message):
	    """Return the length of ``message`` when encoded as GSM 03.38.

	    Every character counts once; the characters ``^ { } \\ ~ € | [ ]`` are
	    escaped in GSM 03.38 and are therefore counted one extra time.

	    :param message: unicode text (``unicode`` on Python 2, ``str`` on Python 3)
	    :returns: number of characters plus the number of escaped characters
	    :raises Exception: if ``message`` is not a unicode text string
	    """
	    try:
	        text_type = unicode  # Python 2
	    except NameError:
	        text_type = str  # Python 3: str is the unicode text type
	    if not isinstance(message, text_type):
	        raise Exception('Need unicode aware strings to get sms len')
	    # The class lists ^ { } \ ~ € | [ ]. The backslash is part of the set so
	    # that it is counted twice, like the other escaped characters. \u20ac is
	    # the euro sign, written as an escape so the pattern does not depend on
	    # the source file encoding.
	    escaped = re.findall(u'[\\^{}\\\\~\u20ac|\\[\\]]', message,
	                         flags=re.M | re.U)
	    return len(message) + len(escaped)

	def smssplit(message):
	    """Return how many SMS parts ``message`` is split into.

	    The length comes from ``smslen``. Up to 160 fits in a single SMS; longer
	    messages are divided into parts of 153 and the result of ``ceil`` is
	    returned.
	    """
	    single_sms_limit = 160  # 70 for UCS-2
	    part_size = 153  # 152 for 16-bit reference numbers, 66 for UCS-2
	    gsm_length = smslen(message)
	    if gsm_length > single_sms_limit:
	        return ceil(gsm_length / part_size)
	    return 1

	# Sample sentence: 67 characters, one of which (€) is counted twice -> 68.
	t = unicode("Hej v€rden. Hvordan har du det i dag? "
	"Jeg glæder mig til at se dig.", 'utf8')
	assert(smslen(t) == 68)
	# None of these characters is in the escaped set, so each counts once.
	assert(smslen(u"こんにちは") == 5)

	# 160 plain characters: the largest message that still fits in one SMS.
	s160 = (u"Lorem ipsum dolor sit amet, consectetur adipiscing "
	"elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. Nam ornare "
	"augue vel nisi tempus, id cras amet.")

	assert(smssplit(s160) == 1)

	# Replace the last character with '~' (counted twice): 159 + 2 = 161.
	s161 = s160[:159] + '~'
	assert(smssplit(s161) == 2)

	# 61 * 5 = 305 plain characters, which needs two parts of 153.
	s305 = u"Lorem" * 61
	assert(smssplit(s305) == 2)
	function CountSmsSplits(string message):
	length := number of UTF-8 decoded characters in message; // unicode aware, e.g. mb_strlen
	// Add the number of times these special chars occur in message: '^{}~€[]\\|', e.g. counted with a regex.
	length := length + count of regular expression matches("/[\\^{}\\\~€\|\\[\\]]/mu", message)
	// Calculate number of splits
	charsPerMessageInChain := 153 // 152 for 16-bit reference numbers, 66 for UCS-2
	charsSingleMessage := 160 // 70 for UCS-2
	if length <= charsSingleMessage:
	return 1
	else:
	return ceiling(length / charsPerMessageInChain)
	end if
	end function