From b508851008e47df01357a09620429215549c5aa3 Mon Sep 17 00:00:00 2001 From: Bensong Liu <bensl@microsoft.com> Date: Thu, 11 Mar 2021 17:50:21 +0800 Subject: [PATCH] d --- .gitignore | 2 +- Makefile | 6 ++- README.md | 61 +++++++++++------------- benchmark.sh | 9 ---- changpu_v2.cc | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 159 insertions(+), 46 deletions(-) create mode 100644 changpu_v2.cc diff --git a/.gitignore b/.gitignore index ac80737..26f0ee0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ *.c changpu_v2 - +.ccls-cache diff --git a/Makefile b/Makefile index c44172d..ab4ac44 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,14 @@ CC ?= gcc -O3 +CXX ?= g++ -O3 -build: +build_cython: cython --embed changpu_v2.py $(CC) changpu_v2.c -o changpu_v2 $$(python3-config --cflags --ldflags --embed) +build_cxx: + $(CXX) changpu_v2.cc -o changpu_v2 -std=c++17 + clean: rm -f *.c changpu changpu_v2 diff --git a/README.md b/README.md index cc2dbaa..815e0e9 100644 --- a/README.md +++ b/README.md @@ -5,41 +5,32 @@ ## performance ``` -CASE Q: PYTHON=0.041s, C=0.040s -CASE 42: PYTHON=0.041s, C=0.023s -CASE 5Q2: PYTHON=0.038s, C=0.022s -CASE 5A53: PYTHON=0.040s, C=0.038s -CASE 5669: PYTHON=0.023s, C=0.023s -CASE 54Q3: PYTHON=0.039s, C=0.021s -CASE A877: PYTHON=0.038s, C=0.034s -CASE KJA5: PYTHON=0.024s, C=0.021s -CASE K325: PYTHON=0.039s, C=0.039s -CASE Q6348: PYTHON=0.028s, C=0.023s -CASE AK6522: PYTHON=0.023s, C=0.040s -CASE QJ35482: PYTHON=0.041s, C=0.038s -CASE 9583JQ3A: PYTHON=0.040s, C=0.022s -CASE K98KK8239: PYTHON=0.040s, C=0.040s -CASE 45232A62A4: PYTHON=0.055s, C=0.141s -CASE A9JK95A8K98: PYTHON=0.059s, C=0.159s -CASE 437998Q7Q637: PYTHON=0.099s, C=0.186s -CASE 4K223JQ82Q6J5: PYTHON=0.268s, C=0.184s -CASE J43J744392A563: PYTHON=1.063s, C=0.162s -CASE 64322666KJK36AJ: PYTHON=2.914s, C=0.171s -CASE 8AAQ4355A626276K: PYTHON=12.286s, C=0.178s -CASE JQ68K2745574A5KA9: PYTHON=......., C=0.224s -CASE 5978339223K9J73A7J: PYTHON=......., 
C=0.330s -CASE 24KJ7Q7886592264278: PYTHON=......., C=0.409s -CASE 3729J6JQ6JQ63QQ75KK2: PYTHON=......., C=0.855s -CASE K3K2495AJ652J749J32K8: PYTHON=......., C=1.368s -CASE Q3K6AK9JQKKJJ79A72A474: PYTHON=......., C=2.406s -CASE 39QJJ3AK787K99655JQ85A9: PYTHON=......., C=4.878s -CASE 432J5259A6K5588JAJJ9KQ7K: PYTHON=......., C=12.215s -CASE A936Q4J7QKKK3J7K7AK377Q54: PYTHON=......., C=13.480s -CASE Q328K7A284Q698649652A58936: PYTHON=......., C=22.705s -CASE 327965J2Q62Q2K4J3247JQK5A4K: PYTHON=......., C=14.555s -CASE 764933Q4577J9J9374KK7J9A4Q69: PYTHON=......., C=14.666s -CASE J48J2JAK84KJ4426K4JQ5655749QK: PYTHON=......., C=15.975s -CASE QQ8AK748K2K6824557K98Q35K554J8: PYTHON=......., C=15.039s +CASE Q: PYTHON=0.040s, C=0.002s +CASE 42: PYTHON=0.037s, C=0.002s +CASE 5Q2: PYTHON=0.030s, C=0.002s +CASE 5A53: PYTHON=0.038s, C=0.004s +CASE 5669: PYTHON=0.026s, C=0.003s +CASE 54Q3: PYTHON=0.039s, C=0.004s +CASE A877: PYTHON=0.038s, C=0.004s +CASE KJA5: PYTHON=0.038s, C=0.003s +CASE K325: PYTHON=0.041s, C=0.002s +CASE Q6348: PYTHON=0.024s, C=0.003s +CASE AK6522: PYTHON=0.038s, C=0.003s +CASE QJ35482: PYTHON=0.025s, C=0.002s +CASE 9583JQ3A: PYTHON=0.037s, C=0.003s +CASE K98KK8239: PYTHON=0.028s, C=0.004s +CASE 45232A62A4: PYTHON=0.040s, C=0.004s +CASE A9JK95A8K98: PYTHON=0.048s, C=0.004s +CASE 437998Q7Q637: PYTHON=0.110s, C=0.004s +CASE 4K223JQ82Q6J5: PYTHON=0.266s, C=0.007s +CASE J43J744392A563: PYTHON=1.044s, C=0.006s +CASE 64322666KJK36AJ: PYTHON=2.963s, C=0.013s +CASE 8AAQ4355A626276K: PYTHON=11.907s, C=0.067s +CASE JQ68K2745574A5KA9: PYTHON=......., C=0.165s +CASE 5978339223K9J73A7J: PYTHON=......., C=0.572s +CASE 24KJ7Q7886592264278: PYTHON=......., C=2.037s +CASE 3729J6JQ6JQ63QQ75KK2: PYTHON=......., C=6.233s +CASE K3K2495AJ652J749J32K8: PYTHON=......., C=27.023s ``` diff --git a/benchmark.sh b/benchmark.sh index cded64c..7e50405 100644 --- a/benchmark.sh +++ b/benchmark.sh @@ -40,15 +40,6 @@ for_case_c 5978339223K9J73A7J for_case_c 24KJ7Q7886592264278 for_case_c 
3729J6JQ6JQ63QQ75KK2
 for_case_c K3K2495AJ652J749J32K8
-for_case_c Q3K6AK9JQKKJJ79A72A474
-for_case_c 39QJJ3AK787K99655JQ85A9
-for_case_c 432J5259A6K5588JAJJ9KQ7K
-for_case_c A936Q4J7QKKK3J7K7AK377Q54
-for_case_c Q328K7A284Q698649652A58936
-for_case_c 327965J2Q62Q2K4J3247JQK5A4K
-for_case_c 764933Q4577J9J9374KK7J9A4Q69
-for_case_c J48J2JAK84KJ4426K4JQ5655749QK
-for_case_c QQ8AK748K2K6824557K98Q35K554J8
diff --git a/changpu_v2.cc b/changpu_v2.cc
new file mode 100644
index 0000000..e04ae34
--- /dev/null
+++ b/changpu_v2.cc
@@ -0,0 +1,125 @@
+#include <algorithm>
+#include <rlib/stdio.hpp>
+#include <rlib/opt.hpp>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+using std::string;
+using namespace std::literals;
+
+// Card faces indexed by card value: '0' is 0, 'A' is 1, '2'..'9' are 2..9,
+// and 'J', 'Q', 'K' are 10, 11, 12.
+constexpr std::string_view card_faces {"0A23456789JQK"sv};
+
+// Map a card value to its face character.
+// The argument must be a valid index into card_faces; it is not checked.
+inline char basic_int_to_char(uint8_t i) {
+    return card_faces[i];
+}
+
+// Map a face character to its card value.
+// A character missing from card_faces yields npos truncated to uint8_t (255),
+// which callers can detect with `value >= card_faces.size()`.
+inline uint8_t basic_char_to_int(char c) {
+    return static_cast<uint8_t>(card_faces.find(c));
+}
+
+// Take the hand of cards returned by rlib's getSubCommand() (presumably the
+// first positional argument -- confirm against rlib) and print up to five ways
+// to pick two disjoint groups of cards whose values add up to the same sum,
+// keeping only the picks that use the most cards.
+int main(int argc, char **argv) {
+    rlib::opt_parser args(argc, argv);
+    auto input = args.getSubCommand();
+    // Each card owns one bit of a uint64_t combination, and `1ull << size` has to
+    // be representable as well, so at most 63 cards can be handled.
+    assert(input.size() < 64);
+
+    // Turn the faces into card values in place. Unknown faces are rejected here:
+    // they would become 255 and index past the end of card_faces when printing.
+    for(auto &card : input) {
+        const auto value = basic_char_to_int(card);
+        if(value >= card_faces.size()) {
+            std::cerr << "Invalid card '" << card << "', expected one of " << card_faces << '\n';
+            return 1;
+        }
+        card = value;
+    }
+    // Now input is a number array.
+
+    // 64-bit shift: a plain `1 << size` is an int, too narrow from 31 cards on.
+    const uint64_t comb_count = 1ull << input.size();
+
+    // map sum_val to [combinations...]
+    std::unordered_map<uint64_t, std::vector<uint64_t>> sum_val_map;
+    // Value sums of the even combinations; odd slots are never written.
+    std::vector<uint64_t> comb_sum_val_cache(comb_count, 0);
+
+    // Iterate over every non-empty combination.
+    // lowest-bit means input[0], highest-bit means input[-1].
+    for(uint64_t comb = 1; comb < comb_count; ++comb) {
+        uint64_t sum_val = 0;
+        if((comb & 1) == 1) {
+            // Odd: comb-1 is the same combination without input[0]. It is either 0
+            // or a smaller even number, so its slot is already filled.
+            sum_val = comb_sum_val_cache[comb-1] + input[0];
+        }
+        else {
+            // Even: split off the highest set bit and add up both parts.
+            // 00100110 ==> sum_val(00100000) + sum_val(00000110)
+            const auto highest_bit_index = 63 - __builtin_clzll(comb);
+            const auto cache_slot_left = 1ull << highest_bit_index;
+            const auto cache_slot_right = comb ^ cache_slot_left;
+            sum_val = cache_slot_right == 0
+                // A single card: its slot is not filled yet, read the card itself.
+                ? input[highest_bit_index]
+                // Both parts are smaller even numbers, so both slots are filled.
+                : comb_sum_val_cache[cache_slot_left] + comb_sum_val_cache[cache_slot_right];
+            comb_sum_val_cache[comb] = sum_val;
+        }
+        sum_val_map[sum_val].emplace_back(comb);
+    }
+
+    // Good, we have filled sum_val_map. Now walk every pair of disjoint
+    // combinations with the same sum, keeping only the first few pairs that use
+    // the largest number of cards seen so far.
+    constexpr std::size_t max_printed_answers = 5;
+    int max_cards = 0;
+    std::vector<std::pair<uint64_t, uint64_t>> winning_answers;
+    for(const auto &kv : sum_val_map) {
+        const auto &combinations = kv.second;
+        for(std::size_t i = 0; i < combinations.size(); ++i) {
+            for(std::size_t j = i+1; j < combinations.size(); ++j) {
+                if((combinations[i] & combinations[j]) != 0)
+                    continue; // A card cannot be on both sides.
+                const int cards = __builtin_popcountll(combinations[i]) + __builtin_popcountll(combinations[j]);
+                if(cards > max_cards) {
+                    // A better pair makes everything collected so far obsolete.
+                    max_cards = cards;
+                    winning_answers.clear();
+                }
+                if(cards == max_cards && winning_answers.size() < max_printed_answers)
+                    winning_answers.emplace_back(combinations[i], combinations[j]);
+            }
+        }
+    }
+
+    // Awesome. Print winners out!
+    auto naive_answer_to_string = [&input](uint64_t bits) {
+        string res;
+        for(std::size_t i = 0; i < input.size(); ++i) {
+            if((bits & (1ull << i)) != 0)
+                res += basic_int_to_char(input[i]);
+        }
+        return res;
+    };
+    for(const auto &answer : winning_answers) {
+        rlib::println(naive_answer_to_string(answer.first), "===", naive_answer_to_string(answer.second));
+    }
+    return 0;
+}
-- 
GitLab