// Kod źródłowy zgłoszenia nr 420299 (source code of submission no. 420299)

#include <bits/stdc++.h>
#include <ext/pb_ds/assoc_container.hpp>
#include <ext/pb_ds/tree_policy.hpp>
// ---- Shorthand macros and aliases shared by the whole file ----
// Short spellings for push_back / make_pair.
#define pb push_back
#define mp make_pair
// Expands to the two arguments begin(a), end(a), e.g. sort(all(v)).
#define all(a) begin(a),end(a)
// int loop over the half-open range [val, to).
#define FOR(x,val,to) for(int x=(val);x<int((to));++x)
// Loop over the closed range [val, to]; the type of x is deduced from val.
#define FORE(x,val,to) for(auto x=(val);x<=(to);++x)
// Range-for over arr, binding every element by (mutable) reference.
#define FORR(x,arr) for(auto &x: arr)
// Iterator loop over arr that starts `plus` positions after begin(arr).
#define FORS(x,plus,arr) for(auto x = begin(arr)+(plus); x != end(arr); ++x)
// Reverse-iterator loop over arr that starts `plus` positions after rbegin().
#define FORREV(x,plus,arr) for(auto x = (arr).rbegin()+(plus); x !=(arr).rend(); ++x)
// Prints s_ and a newline to cout, then terminates the program with exit(0).
#define REE(s_) {cout<<s_<<'\n';exit(0);}
// Reads every element of arr with sc(); note that the loop variable is named `i`.
#define GET(arr) for(auto &i: (arr)) sc(i)
// Debug print "<expression text> is <value>" to cerr; endl flushes the stream.
// The expansion already ends with ';'.
#define whatis(x) cerr << #x << " is " << (x) << endl;
// Alternative names for the members of std::pair.
#define e1 first
#define e2 second
// Large int sentinel whose four bytes are all 0x7f.
#define INF 0x7f7f7f7f
// Short names for frequently used types.
typedef std::pair<int,int> pi;
typedef std::vector<int> vi;
typedef std::vector<std::string> vs;
typedef int64_t ll;
typedef uint64_t ull;
// Short names for the hashed containers.
#define umap unordered_map
#define uset unordered_set
// Everything below relies on unqualified std:: and __gnu_pbds:: names.
using namespace std;
using namespace __gnu_pbds;

// When ONLINE_JUDGE is defined, turn the debug macro into a no-op.
// The previous definition is dropped first: redefining a function-like macro with a
// different replacement list without #undef is ill-formed and is diagnosed by compilers.
// (#undef of a name that is not a macro is harmless.)
#ifdef ONLINE_JUDGE
#undef whatis
#define whatis(x) ;
#endif
// Windows has no getchar_unlocked(); map it to the CRT's _getchar_nolock().
#ifdef _WIN32
#define getchar_unlocked() _getchar_nolock()
// NOTE(review): this CRT switch presumably has to be defined before <stdio.h> is
// included to have any effect - confirm and move it above the includes if so.
#define _CRT_DISABLE_PERFCRIT_LOCKS
#endif
// Debug stream printers for pair / map / set / vector.
// Formats: pair -> "(first,second)", map -> "(k,v)(k,v)...", set and vector -> "[a,b,]".
//
// All four overloads are declared before any of them is defined so that nested types
// (vector<pair<..>>, map<K, vector<..>>, set<pair<..>>, ...) can be printed: the nested
// `os << element` is looked up from the template definition, where a later overload
// would otherwise be invisible. Arguments are taken by const reference so that printing
// never copies the container.
template<class L, class R> std::ostream& operator<<(std::ostream &os, const std::pair<L, R> &P);
template<class L, class R> std::ostream& operator<<(std::ostream &os, const std::map<L, R> &P);
template<class T> std::ostream& operator<<(std::ostream &os, const std::set<T> &V);
template<class T> std::ostream& operator<<(std::ostream &os, const std::vector<T> &V);

// Prints every map entry as "(key,value)" with no separator between entries.
template<class L, class R> std::ostream& operator<<(std::ostream &os, const std::map<L, R> &P) { for(auto const &vv: P)os<<"("<<vv.first<<","<<vv.second<<")"; return os; }
// Prints "[e1,e2,...,]" (each element is followed by a comma).
template<class T> std::ostream& operator<<(std::ostream &os, const std::set<T> &V) { os<<"[";for(auto const &vv:V)os<<vv<<","; os<<"]"; return os; }
// Prints "[e1,e2,...,]" (each element is followed by a comma).
template<class T> std::ostream& operator<<(std::ostream &os, const std::vector<T> &V) { os<<"[";for(auto const &vv:V)os<<vv<<","; os<<"]"; return os; }
// Prints "(first,second)".
template<class L, class R> std::ostream& operator<<(std::ostream &os, const std::pair<L, R> &P) { os<<"("<<P.first<<","<<P.second<<")"; return os; }
// Fast, unchecked decimal string -> int conversion.
// Accepts an optional leading '-' followed by digits; no validation is performed, every
// character is folded in as (ch - '0'). Overflow is not detected.
// An empty string yields 0 (the previous version dereferenced begin() of an empty
// string and then advanced the iterator past end()).
inline int fstoi(const std::string &str){
    if(str.empty())
        return 0;
    auto it = str.begin();
    const bool neg = (*it == '-');
    if(neg)
        ++it; // skip the sign; the digits start right after it
    int num = 0;
    for(; it != str.end(); ++it)
        num = num * 10 + (*it - '0');
    return neg ? -num : num;
}
// Reads the next non-blank character from stdin into x.
// Bytes whose value as `char` is below 33 (space, control characters) are skipped.
// At end of input the function now returns with x == static_cast<char>(EOF); the
// previous version stored the result in a char first, so where char is signed EOF
// compared as "< 33" and the loop never terminated.
inline void getch(char &x){
    for(;;){
        const int c = getchar_unlocked();
        x = static_cast<char>(c);
        if(c == EOF)
            return;
        if(!(x < 33))
            return;
    }
}
// Reads the next whitespace-delimited token from stdin into str (str is cleared first).
// Bytes whose value as `char` is below 33 are skipped before the token and end it.
// End of input terminates both phases, so at EOF the function returns with whatever has
// been collected (possibly an empty string). The previous version kept the result in a
// char: at EOF the skip loop never terminated where char is signed, and the read loop
// kept appending forever where char is unsigned.
inline void getstr(std::string &str){
    str.clear();
    int c = getchar_unlocked();
    while(c != EOF && static_cast<char>(c) < 33)
        c = getchar_unlocked();
    while(c != EOF && static_cast<char>(c) > 32){
        str += static_cast<char>(c);
        c = getchar_unlocked();
    }
}
// Fast integer reader on top of getchar_unlocked().
// Skips bytes below 33, accepts an optional '-', then folds in every following byte
// greater than 47 ('/') as a decimal digit; no further validation is done.
// Returns false (with num set to 0) when end of input is reached before a token starts,
// true otherwise.
template<typename T> inline bool sc(T &num){
    bool neg = false;
    int c;
    num = 0;
    while(c = getchar_unlocked(), c < 33){
        if(c == EOF)
            return false;
    }
    if(c == '-'){
        neg = true;
        c = getchar_unlocked();
    }
    for(; c > 47; c = getchar_unlocked())
        num = num * 10 + c - 48;
    if(neg)
        num *= -1;
    return true;
}
// Reads several integers in order: sc(a, b, c) is sc(a); sc(b); sc(c);.
// It delegates to the single-value overload instead of duplicating the parser, which also
// gives it the EOF check the old copy lacked (it used to loop forever on truncated
// input). Values that cannot be read because input ended are left at 0.
template<typename T, typename ...Args> inline void sc(T &num, Args &...args){
    static_cast<void>(sc(num)); // single-argument call selects the bool overload
    sc(args...);
}
// Order-statistics set built on the GNU pb_ds red-black tree:
// find_by_order(k) yields an iterator to the k-th smallest element (0-based) and
// order_of_key(x) the number of stored elements strictly less than x.
template<typename T> using ordered_set = tree<T, null_type, less<T>, rb_tree_tag, tree_order_statistics_node_update>; // s.find_by_order(), s.order_of_key() <- the latter works like lower_bound
// The same tree with an int value attached to every key.
template<typename T> using ordered_map = tree<T, int, less<T>, rb_tree_tag, tree_order_statistics_node_update>;
// Size constant; in the visible code it is only mentioned inside a commented-out declaration.
#define N 3000001

// Modulus (1e9+7) for all modular arithmetic in main().
constexpr ll mod = 1000000007;

int main(){
    ios_base::sync_with_stdio(0);cin.tie(0);
    int n,q;
    sc(n,q);
    int a[n];
    a[0] = 1;
    FOR(i,1,n){
        sc(a[i]);
    }
    ll dsum[n]; // suma dystansów do wszystkich wierzchołków dla pełnego poddrzewa od i
    ll cnt[n]; // ilość wierzchołków dla pełnego poddrzewa (włącznie)
    cnt[n - 1] = 1; // last level -> 1 zawsze
    dsum[n - 1] = 0;
    for(int i = n-2; i >= 0; --i){
        // last level -> 1 zawsze
        /* cnt[i] = a[i] * (i + 1 < n ? cnt[i + 1] : 1) % mod; */
        cnt[i] = (1 + a[i + 1] * cnt[i + 1]) % mod;
        /* dsum[i] = (dsum[i + 1] + 1) * (cnt[i] - 1) % mod; */
        /* dsum[i] = (dsum[i + 1] + cnt[i + 1]) * (cnt[i] - 1) % mod; */
        dsum[i] = (dsum[i + 1] + cnt[i + 1]) * (a[i + 1]) % mod; // chyba valid
        /* dsum[i] = (dsum[i + 1] * (a[i + 1]) + (cnt[i] - 1)) % mod; // alternative valid */
    }
    /* ll nsum[N]; // suma odległości do all innych wierzchołków dla wierzchołka w itej warstwie */
    ll nsum[n]; // suma odległości do all innych wierzchołków dla wierzchołka w itej warstwie
    // precompute ez
    ll cntup = 0;
    ll sumup = 0;
    // no recursion needed
    constexpr ll modinv2 = 500000004;
    for(int i = 0; i < n; ++i){
        nsum[i] = (dsum[i] + sumup) % mod;
        // z all subtree poza jednym
        // -> + całości - tyle ile w tym nowym do którego idziemy - cnt tego do ktorego idziemy
        // jak i + 1 == n, to next vals don't matter really so whatever
        // ale sio mooze cos odwalic wiec lepiej wyifowaci guess
        if(i + 1 < n){
            ll nwcntup = (cntup + cnt[i] - cnt[i + 1]) % mod;
            ll nwsumup = (sumup + nwcntup + dsum[i] - dsum[i + 1] - cnt[i + 1]) % mod;
            cntup = nwcntup;
            sumup = nwsumup;
        }
        /* whatis(i) */
        /* whatis(nsum[i]) */
        // -> nice
    }
    // can be int?
    ll wynfullt[n]; // prefix jakie są odd -> po prostu zależne od odległości najbliższego w dół co ma odd a_i
    wynfullt[n-1] = 0; // dla jakiegokolwiek poddrzewa 0 zawsze bdz (bo root just and only root)
    for(int i = n-2; i >= 0; --i){
        if(a[i+1] % 2 == 0)
            wynfullt[i] = 0;
        else
            wynfullt[i] = wynfullt[i+1] + 1; // shift, dodanie 0
    }
    ll wynbez1gal[n]; // bez jednej gałęzi -> xorujemy parzystosc cnta bezposrednich childow
    wynbez1gal[n-1] = 0;
    for(int i = n-2; i >= 0; --i){
        if(a[i+1] % 2 == 1)
            wynbez1gal[i] = 0;
        else
            wynbez1gal[i] = wynfullt[i+1] + 1; // shift, dodanie 0
        // uzalezniamy od fulli normalnie obv
    }
    /* whatis(cnt[0]) */
    /* whatis(dsum[0]) */
    int qu[q][3];
    /* set<int> st[q]; // jakie odległości występują odd razy */
    /* set<int> uppre[n]; // pre od roota do lca - 1 bez jednej krawędzi all applied */
    /* set<int> uppreshft[n]; // pre od roota do lca - 1 bez jednej krawędzi all applied */
    // offlinowe let's go
    // -> to moze sluzyc jako suma prefixowa wsm bardzo dobrze, rowniez do
    // sciezek wewnetrzych do lca
    // tylko do lca unshifted, a tutaj shifted
    // zamiast:
    /* for(int x = lc - 1; x >= 0; --x){ */
    /*     apply_lr(lc - x, wynbez1gal[x] + (lc - x)); */
    /* } */
    // zaczynanie z shiftem od 1, potem shift prev pre o 1 też
    // czyli shift całości wsm
    auto apply = [](int x, set<int> &st){
        for(int i = 0; i <= x; ++i){
            if(st.count(i))
                st.erase(i);
            else
                st.insert(i);
        }
    };
    auto apply_lr = [](int l, int r, set<int> &st){
        for(int i = l; i <= r; ++i){
            if(st.count(i))
                st.erase(i);
            else
                st.insert(i);
        }
    };
    auto apply_s = [](set<int> &dest, const set<int> &src){
        // smaller to bigger:??
        FORR(i,src){
            if(dest.count(i))
                dest.erase(i);
            else
                dest.insert(i);
        }
        /* for(int i = 0; i <= x; ++i){ */
        /*     if(st.count(i)) */
        /*         st.erase(i); */
        /*     else */
        /*         st.insert(i); */
        /* } */
    };
    // XD, nie 3 miliony bruh
    /* auto apply_bs = [](set<int> &dest, const bitset<3000000> &src){ */
    /*     int ind = src._Find_first(); */
    /*     while(ind != 3000000){ */
    constexpr int bitset_sz = 300000;
    /* constexpr int bitset_sz = 3000000; */
    // wtf, jak to jest niby wolniejsze?
    auto apply_bs_bs = [](bitset<bitset_sz> &dst, const bitset<bitset_sz> &src){
        /* int ind = src._Find_first(); */
        dst ^= src;
        /* while(ind != bitset_sz){ */
        /*     if(dest.count(ind)) */
        /*         dest.erase(ind); */
        /*     else */
        /*         dest.insert(ind); */
        /*     ind = src._Find_next(ind); // first after (so exclusive) */
        /* } */
    };
    auto apply_bs = [](set<int> &dest, const bitset<bitset_sz> &src){
        int ind = src._Find_first();
        while(ind != bitset_sz){
            if(dest.count(ind))
                dest.erase(ind);
            else
                dest.insert(ind);
            ind = src._Find_next(ind); // first after (so exclusive)
        }
    };
    // wsm ma sens, bo to jest 0...cos xd
    /* auto apply_bs_pre = [](set<int> &dest, const bitset<bitset_sz> &src){ */
    /*     // ma set 0...cos */
    /*     int ind = src._Find_first(); */
    /*     while(ind != bitset_sz){ */
    /*         if(dest.count(ind)) */
    /*             dest.erase(ind); */
    /*         else */
    /*             dest.insert(ind); */
    /*         ind = src._Find_next(ind); // first after (so exclusive) */
    /*     } */
    /* }; */
    auto shft = [](int ile, set<int> &st){
        set<int> nw;
        FORR(i,st){
            nw.insert(i + ile);
        }
        st = std::move(nw);
    };
    // spoko, ale O(dużo) pamięci duhhh
    // -> na koniec, offlinowe, bez > 1 seta
    deque<array<int,3>> buf(q);
    FOR(i,0,q){
        int f,s,lc;
        sc(f,s,lc);
        --f,--s,--lc;
        buf[i][0] = f;
        buf[i][1] = s;
        buf[i][2] = lc;
    }
    bitset<bitset_sz> empty;
    bitset<bitset_sz> full;
    // wait, przeciez bitset zajmuje / 8 pamięci, a nie / 64 XDDD
    /* bitset<bitset_sz> wynbez1galhalf[150000]; */
    full.set();
    /* #define LIM 1 */
    #define LIM 15000
    bitset<300000> stt[LIM];
    while(!buf.empty()){
        q = min<int>(LIM, buf.size());
        /* set<int> stt[q]; */
        vi upprereq[n];
        vi wynfullreq[n];
        vi wynbez1req[n];
        vi upprenoshftreq[n];
        /* vi upprenoshftreqneg[n]; */ // no such thing lol
        vi inlcareq[n]; // to jest bardziej skompilowane, bo dawanie samego prefiksu nie jest wystarczające def
        FOR(i,0,q){
            stt[i].reset();
            /* if(i % 10000 == 0) */
            /*     whatis(i) */
            int f,s,lc;
            f = buf[i][0];
            s = buf[i][1];
            lc = buf[i][2];
            /* sc(f,s,lc); */
            /* --f,--s,--lc; */
            qu[i][0] = f;
            qu[i][1] = s;
            qu[i][2] = lc;
            // jednak nie, nawet na samplu jak oba są rootem, to się różnią o 1
            /* if(cnt[0] % 2 == 0){ // if goes over modulo chg that obv */
            /*     cout << (((nsum[f] + nsum[s]) / 2 - nsum[s]) % mod + mod) % mod << '\n'; */
            /* } */
            set<int> st; // niech na razie bedzie set i guess
            // sth sth linkustree
            // ew bitset
            // tmp na pałe
            bitset<bitset_sz> &bs = stt[i];
            auto apply = [&](int x){
                bs ^= full >> (bitset_sz - x - 1);
                /* for(int i = 0; i <= x; ++i){ */
                /*     if(st.count(i)) */
                /*         st.erase(i); */
                /*     else */
                /*         st.insert(i); */
                /* } */
            };
            /* auto apply_lr = [&](int l, int r){ */
            /*     for(int i = l; i <= r; ++i){ */
            /*         if(st.count(i)) */
            /*             st.erase(i); */
            /*         else */
            /*             st.insert(i); */
            /*     } */
            /* }; */
            /* auto apply = [&](int x){ */
            /*     for(int i = 0; i <= x; ++i){ */
            /*         if(st.count(i)) */
            /*             st.erase(i); */
            /*         else */
            /*             st.insert(i); */
            /*     } */
            /* }; */
            /* auto apply_lr = [&](int l, int r){ */
            /*     for(int i = l; i <= r; ++i){ */
            /*         if(st.count(i)) */
            /*             st.erase(i); */
            /*         else */
            /*             st.insert(i); */
            /*     } */
            /* }; */
            // shadowował
            /* if(lc == f || lc == s){ */
            if(f != s && (lc == f || lc == s)){
                /* int mn = min(f,s); */ // == lc
                int mx = max(f,s);
                apply(wynfullt[mx]);
                // let's do precompute tego tera
                /* for(int x = lc - 1; x >= 0; --x){ */
                /*     apply_lr(lc - x, wynbez1gal[x] + (lc - x)); */
                /* } */
                // wazny if
                if(lc){
                    /* apply_s(st, uppreshft[lc - 1]); */
                    upprereq[lc-1].push_back(i);
                }
                // w sumie, czy zamiast tego nie mogę po prostu jakoś wywalić
                // (zxorować) tą linię w górę? (1,...,lc)
                // i brać fulla roota?
                // i obv manualnie zamienić wartości na tej linii na rzeczywiste
                // -> not really, bo nie tylko linia jest afektowania
                // ale anyway, precomp tego ez mozna bedzie wstawić
                /* apply_lr(lc, wynbez1gal[0] + lc); */
                // fuck, jeszcze to
                /* for(int x = mx - 1; x > lc; --x){ */
                /*     // xorowańsko na pałe? */
                /*     // ale wait, nie moge sie iterowac for sure */
                /*     // trzymanie połowy bitsetów -> jak ma oba to ez xor; */
                /*     // inaczej na podstawie bliskiego (nastepnego) musze wyliczyc */
                /*     apply(wynbez1gal[x]); */
                /*     /1* wynbez1req[x].pb(i); // maybe that will suffixe who knows *1/ */
                /* } */
                // <lc;mx-1>
                // wa:c
                // wait, bo to jest:
                // <lc+1;mx-1>
                /* whatis(lc) */
                /* whatis(mx-1) */
                if(mx-1 > lc){
                    upprenoshftreq[mx-1].pb(i);
                    upprenoshftreq[lc].pb(i);
                }
                /* if(lc) */
                /*     upprenoshftreq[lc-1].pb(i); */
                // xor prefów -> ez
                /* wynbez1req[lc].pb(i); */
                apply(wynbez1gal[lc]); // tutaj lc ma tylko 1 wywaloną gałęź
                // -> nice, przeszło sampla
            }
            // thx Anadi, ważna różnica; f == s ale rózne lca -> tylko takie same
            // layery, rózne nody
            /* else if(f == s){ */
            else if(f == s && (lc == f || lc == s)){
                /* whatis("X") */
                // wa suddenly?
                // no tak, bo powyższy if shadowuje ten przecie

                // tera już nie shadowuje up, ale still bad res hmmm
                // zmieniłem wyliczanie dla st od czasu gdy miałem dobry res

                // no i co istotniejsze, też skalowanie x2 dodałem na dole

                // najpierw zobaczmy czy działa jak A == B
                apply(wynfullt[f]);
                // if nie jest lca wsm
                /* apply(wynfullt[s]); */
                // edge case troche, bo w lc możemy mieć 2 wywalone gałęzie
                /* for(int x = lc; */
                /* whatis(st) */
                /* for(int x = lc - 1; x >= 0; --x){ */
                /*     /1* apply(wynbez1gal[x] + (lc - x)); // ez shift for now *1/ */
                /*     // shiftowanie o 2 * dist od lca though */
                /*     /1* whatis(x) *1/ */
                /*     /1* whatis(wynbez1gal[x]) *1/ */
                /*     // nevermind, poniżej jest bad, bo to już nie powinien być prefix */
                /*     // przecie */
                /*     // -> lets add non-prefix interval to apply */
                /*     /1* apply(wynbez1gal[x] + 2 * (lc - x)); // ez shift for now *1/ */
                /*     /1* apply_lr(2 * (lc - x), wynbez1gal[x] + 2 * (lc - x)); // ez shift for now *1/ */
                /*     // wait, bo jakby wewnątrz też powinien być ten shift co drugi */
                /*     // -> myslę że wsm mogę robić shift o normalny dystans, i tylko pod */
                /*     // koniec dif *= 2 zrobić, to powinno mieć sens, bo zawsze miałbym */
                /*     // właściwie ten wewnętrzny shift / mul o 2 */
                /*     apply_lr(lc - x, wynbez1gal[x] + (lc - x)); // ez shift for now */
                /* } */
                // w koncu tez tutaj
                if(lc){
                    /* apply_s(st, uppreshft[lc - 1]); */
                    upprereq[lc-1].push_back(i);
                }
            }
            else{
                // opt much?
                if(f != s){
                    apply(wynfullt[f]);
                    apply(wynfullt[s]);
                }
                /* for(int x = lc - 1; x >= 0; --x){ */
                /*     apply_lr(lc - x, wynbez1gal[x] + (lc - x)); */
                /* } */
                // i tez tutaj
                // dif po odkomentowaniu tego :c
                if(lc){
                    /* apply_s(st, uppreshft[lc - 1]); */
                    upprereq[lc-1].push_back(i);
                }
                /* for(int x = f - 1; x > lc; --x){ */
                /*     apply(wynbez1gal[x]); */
                /* } */
                /* for(int x = s - 1; x > lc; --x){ */
                /*     apply(wynbez1gal[x]); */
                /*     // w teorii, mogę nie applyować części wspólnej od f i od s w */
                /*     // dół, tylko od max(f,s) - 1 do min(f,s) włącznie -> */
                /*     // optymalizacja i guess */
                /* } */
                // that's the opt
                // -> now let's try that opt
                /* for(int x = max(f, s) - 1; x >= min(f, s); --x){ */
                /*     apply(wynbez1gal[x]); */
                /*     /1* wynbez1req[i].pb(x); // maybe that will suffixe who knows *1/ */
                /*     /1* wynbez1req[x].pb(i); // maybe that will suffixe who knows *1/ */
                /*     // somehow slower... */
                /* } */
                // <min(f,s); max(f,s)-1>
                int lll = min(f,s);
                int rrr = max(f,s) - 1;
                /* if(> lc){ */
                if(rrr >= lll){
                    upprenoshftreq[rrr].pb(i);
                    if(lll-1 >= 0)
                        upprenoshftreq[lll-1].pb(i);
                }
                /* } */
                /* apply(wynbez1gal[lc]); // tutaj lc ma dwie wywalone gałęzie */
                /* if(a[lc + 1] > 2){ */
                // no fix though
                /* if(a[lc] > 2){ // wsm not even needed pewnie */
                // wa na samplu^^
                /*     // ale wait, co z zerem? nawet jak nie ma gałęzi, to to still */
                /*     // trzeba chyba dodać right? */
                /*     apply(wynfullt[lc]); // tutaj lc ma dwie wywalone gałęzie */
                /* } */
                /* whatis(wynfullt[lc]) */
                apply(wynfullt[lc]); // tutaj lc ma dwie wywalone gałęzie
                // bez wa na samplu^^
                // -> w sumie to samo, co w pełnym drzewie po prostu?
                // no bo wtedy parzystość next a_i zostaje taka sama co nie?
                // jedynie, hard edge case jak a_i(+1) == 2, bo wtedy nie ma nic
                // -> wyifować // to in that case

                // -> diff on first try sadly
                // albo i nie, wait, źle jest dla f == s tera (b4 było dobrze); dla
                // tego nowego jest dobrze
                // no tak, bo if dla lca == f || lca == s shadowował ten dla f == s
            }
            /* stt[i] = st; */
            /* whatis(st) */
        }
            /* exit(0);; */
        /* set<int> uppreshft; */
        /* for(int i = 0; i < n; ++i){ */
        /*     apply(wynbez1gal[i], uppreshft); */
        /*     shft(1, uppreshft); */
        /*     FORR(x,upprereq[i]){ */
        /*         apply_s(stt[x], uppreshft); */
        /*     } */
        /* } */
        // XDDDDDD
        // trzymałem 3milionowy zamiast 300000
        /* bitset<3000000> uppreshft; */
        /* bitset<3000000> crwynbez1gal; // zawsze prefix 1 */
        /* bitset<3000000> crwynfull; // zawsze prefix 1 */
        bitset<bitset_sz> uppreshft;
        bitset<bitset_sz> upprenoshft;
        bitset<bitset_sz> crwynbez1gal; // zawsze prefix 1
        bitset<bitset_sz> crwynfull; // zawsze prefix 1
        // jak crwynbez1gal od góry szybko???
        // wsm tu tylko zalezne od wynfullt
        // ale nie moge trzymac ich all :c
        /* wynbez1gal[n-1] = 0; */
        /* for(int i = n-2; i >= 0; --i){ */
        /*     if(a[i+1] % 2 == 1) */
        /*         wynbez1gal[i] = 0; */
        /*     else */
        /*         wynbez1gal[i] = wynfullt[i+1] + 1; // shift, dodanie 0 */
        /*     // uzalezniamy od fulli normalnie obv */
        /* } */
        int crcnt = 0; // ile 1
        for(int i = 0; i < n; ++i){
            if(i+1 == n || a[i+1] % 2 == 1){
                if(crcnt > 500){
                    crwynbez1gal.reset();
                    crwynbez1gal[0] = 1;
                }
                else{
                    for(int i = 0; i < crcnt; ++i){
                        crwynbez1gal[i] = 0;
                    }
                    crwynbez1gal[0] = 1;
                }
                crcnt = 1;
            }
            else{
                // lol, wolniej
                // ale wyliczam tez to inne cos i guess
                // na razie to zostawmy, to nie jest bottleneck def
                if(crcnt > 500){
                    crwynbez1gal &= empty;
                }
                else{
                    for(int i = 0; i < crcnt; ++i){
                        crwynbez1gal[i] = 0;
                    }
                }
                if(wynfullt[i+1] > 500){
                    crwynbez1gal |= full >> (bitset_sz - wynfullt[i+1] - 2);
                }
                else{
                    crcnt = wynfullt[i+1] + 2;
                    for(int i = 0; i < crcnt; ++i){
                        crwynbez1gal[i] = 1;
                    }
                }
                // bo to powinno być + 2 lol
                /* crcnt = wynfullt[i+1] + 1; */
                crcnt = wynfullt[i+1] + 2;
            }
            upprenoshft ^= crwynbez1gal; // i tyle, bez shiftu, cała różnica
            uppreshft ^= crwynbez1gal;
            uppreshft <<= 1;
            FORR(x,upprereq[i]){
                /* apply_bs(stt[x], uppreshft); */
                apply_bs_bs(stt[x], uppreshft);
            }
            FORR(x,upprenoshftreq[i]){
                /* apply_bs(stt[x], upprenoshft); */
                apply_bs_bs(stt[x], upprenoshft);
            }
            /* FORR(x,wynbez1req[i]){ */
            /*     apply_bs(stt[x], crwynbez1gal); */
            /* } */
        }
        FOR(i,0,q){
            /* if(i % 10000 == 0) */
            /*     whatis(i) */
            int f = qu[i][0];
            int s = qu[i][1];
            int lc = qu[i][2];
            ll dif = 0; // dif od (sa + sb) / 2
            bool kto = 0; // 0 -> A weznie; 1 -> B weznie; xorowane co take oddowego elementu
            ll distfs = f + s - 2 * lc;
            /* auto &st = stt[i]; */
            vi st;
            auto &bs = stt[i];
            int ind = bs._Find_first();
            while(ind != bitset_sz){
                st.pb(ind);
                ind = bs._Find_next(ind); // first after (so exclusive)
            }
            for(auto it = st.rbegin(); it != st.rend(); ++it){
                /* if(kto){ */
                if(!kto){ // 0 -> A weznie; had it opposite for a while
                    dif += (*it * 2 + distfs);
                    /* dif += *it; */
                }
                else{
                    /* whatis(-*it) */
                    dif -= (*it * 2 + distfs);
                    /* dif -= *it; */
                }
                kto ^= 1;
            }
            ll res = (((nsum[f] + nsum[s] + dif) % mod * modinv2 - nsum[s]) % mod + mod) % mod;
            cout << res << '\n';
        }
        FOR(i,0,q){
            buf.pop_front();
        }
    }
}

#include <bits/stdc++.h>
#include <ext/pb_ds/assoc_container.hpp>
#include <ext/pb_ds/tree_policy.hpp>
// ---- Shorthand macros and aliases shared by the whole file ----
// Short spellings for push_back / make_pair.
#define pb push_back
#define mp make_pair
// Expands to the two arguments begin(a), end(a), e.g. sort(all(v)).
#define all(a) begin(a),end(a)
// int loop over the half-open range [val, to).
#define FOR(x,val,to) for(int x=(val);x<int((to));++x)
// Loop over the closed range [val, to]; the type of x is deduced from val.
#define FORE(x,val,to) for(auto x=(val);x<=(to);++x)
// Range-for over arr, binding every element by (mutable) reference.
#define FORR(x,arr) for(auto &x: arr)
// Iterator loop over arr that starts `plus` positions after begin(arr).
#define FORS(x,plus,arr) for(auto x = begin(arr)+(plus); x != end(arr); ++x)
// Reverse-iterator loop over arr that starts `plus` positions after rbegin().
#define FORREV(x,plus,arr) for(auto x = (arr).rbegin()+(plus); x !=(arr).rend(); ++x)
// Prints s_ and a newline to cout, then terminates the program with exit(0).
#define REE(s_) {cout<<s_<<'\n';exit(0);}
// Reads every element of arr with sc(); note that the loop variable is named `i`.
#define GET(arr) for(auto &i: (arr)) sc(i)
// Debug print "<expression text> is <value>" to cerr; endl flushes the stream.
// The expansion already ends with ';'.
#define whatis(x) cerr << #x << " is " << (x) << endl;
// Alternative names for the members of std::pair.
#define e1 first
#define e2 second
// Large int sentinel whose four bytes are all 0x7f.
#define INF 0x7f7f7f7f
// Short names for frequently used types.
typedef std::pair<int,int> pi;
typedef std::vector<int> vi;
typedef std::vector<std::string> vs;
typedef int64_t ll;
typedef uint64_t ull;
// Short names for the hashed containers.
#define umap unordered_map
#define uset unordered_set
// Everything below relies on unqualified std:: and __gnu_pbds:: names.
using namespace std;
using namespace __gnu_pbds;

// When ONLINE_JUDGE is defined, turn the debug macro into a no-op.
// The previous definition is dropped first: redefining a function-like macro with a
// different replacement list without #undef is ill-formed and is diagnosed by compilers.
// (#undef of a name that is not a macro is harmless.)
#ifdef ONLINE_JUDGE
#undef whatis
#define whatis(x) ;
#endif
// Windows has no getchar_unlocked(); map it to the CRT's _getchar_nolock().
#ifdef _WIN32
#define getchar_unlocked() _getchar_nolock()
// NOTE(review): this CRT switch presumably has to be defined before <stdio.h> is
// included to have any effect - confirm and move it above the includes if so.
#define _CRT_DISABLE_PERFCRIT_LOCKS
#endif
// Debug stream printers for pair / map / set / vector.
// Formats: pair -> "(first,second)", map -> "(k,v)(k,v)...", set and vector -> "[a,b,]".
//
// All four overloads are declared before any of them is defined so that nested types
// (vector<pair<..>>, map<K, vector<..>>, set<pair<..>>, ...) can be printed: the nested
// `os << element` is looked up from the template definition, where a later overload
// would otherwise be invisible. Arguments are taken by const reference so that printing
// never copies the container.
template<class L, class R> std::ostream& operator<<(std::ostream &os, const std::pair<L, R> &P);
template<class L, class R> std::ostream& operator<<(std::ostream &os, const std::map<L, R> &P);
template<class T> std::ostream& operator<<(std::ostream &os, const std::set<T> &V);
template<class T> std::ostream& operator<<(std::ostream &os, const std::vector<T> &V);

// Prints every map entry as "(key,value)" with no separator between entries.
template<class L, class R> std::ostream& operator<<(std::ostream &os, const std::map<L, R> &P) { for(auto const &vv: P)os<<"("<<vv.first<<","<<vv.second<<")"; return os; }
// Prints "[e1,e2,...,]" (each element is followed by a comma).
template<class T> std::ostream& operator<<(std::ostream &os, const std::set<T> &V) { os<<"[";for(auto const &vv:V)os<<vv<<","; os<<"]"; return os; }
// Prints "[e1,e2,...,]" (each element is followed by a comma).
template<class T> std::ostream& operator<<(std::ostream &os, const std::vector<T> &V) { os<<"[";for(auto const &vv:V)os<<vv<<","; os<<"]"; return os; }
// Prints "(first,second)".
template<class L, class R> std::ostream& operator<<(std::ostream &os, const std::pair<L, R> &P) { os<<"("<<P.first<<","<<P.second<<")"; return os; }
// Fast, unchecked decimal string -> int conversion.
// Accepts an optional leading '-' followed by digits; no validation is performed, every
// character is folded in as (ch - '0'). Overflow is not detected.
// An empty string yields 0 (the previous version dereferenced begin() of an empty
// string and then advanced the iterator past end()).
inline int fstoi(const std::string &str){
    if(str.empty())
        return 0;
    auto it = str.begin();
    const bool neg = (*it == '-');
    if(neg)
        ++it; // skip the sign; the digits start right after it
    int num = 0;
    for(; it != str.end(); ++it)
        num = num * 10 + (*it - '0');
    return neg ? -num : num;
}
// Reads the next non-blank character from stdin into x.
// Bytes whose value as `char` is below 33 (space, control characters) are skipped.
// At end of input the function now returns with x == static_cast<char>(EOF); the
// previous version stored the result in a char first, so where char is signed EOF
// compared as "< 33" and the loop never terminated.
inline void getch(char &x){
    for(;;){
        const int c = getchar_unlocked();
        x = static_cast<char>(c);
        if(c == EOF)
            return;
        if(!(x < 33))
            return;
    }
}
// Reads the next whitespace-delimited token from stdin into str (str is cleared first).
// Bytes whose value as `char` is below 33 are skipped before the token and end it.
// End of input terminates both phases, so at EOF the function returns with whatever has
// been collected (possibly an empty string). The previous version kept the result in a
// char: at EOF the skip loop never terminated where char is signed, and the read loop
// kept appending forever where char is unsigned.
inline void getstr(std::string &str){
    str.clear();
    int c = getchar_unlocked();
    while(c != EOF && static_cast<char>(c) < 33)
        c = getchar_unlocked();
    while(c != EOF && static_cast<char>(c) > 32){
        str += static_cast<char>(c);
        c = getchar_unlocked();
    }
}
// Fast integer reader on top of getchar_unlocked().
// Skips bytes below 33, accepts an optional '-', then folds in every following byte
// greater than 47 ('/') as a decimal digit; no further validation is done.
// Returns false (with num set to 0) when end of input is reached before a token starts,
// true otherwise.
template<typename T> inline bool sc(T &num){
    bool neg = false;
    int c;
    num = 0;
    while(c = getchar_unlocked(), c < 33){
        if(c == EOF)
            return false;
    }
    if(c == '-'){
        neg = true;
        c = getchar_unlocked();
    }
    for(; c > 47; c = getchar_unlocked())
        num = num * 10 + c - 48;
    if(neg)
        num *= -1;
    return true;
}
// Reads several integers in order: sc(a, b, c) is sc(a); sc(b); sc(c);.
// It delegates to the single-value overload instead of duplicating the parser, which also
// gives it the EOF check the old copy lacked (it used to loop forever on truncated
// input). Values that cannot be read because input ended are left at 0.
template<typename T, typename ...Args> inline void sc(T &num, Args &...args){
    static_cast<void>(sc(num)); // single-argument call selects the bool overload
    sc(args...);
}
// Order-statistics set built on the GNU pb_ds red-black tree:
// find_by_order(k) yields an iterator to the k-th smallest element (0-based) and
// order_of_key(x) the number of stored elements strictly less than x.
template<typename T> using ordered_set = tree<T, null_type, less<T>, rb_tree_tag, tree_order_statistics_node_update>; // s.find_by_order(), s.order_of_key() <- the latter works like lower_bound
// The same tree with an int value attached to every key.
template<typename T> using ordered_map = tree<T, int, less<T>, rb_tree_tag, tree_order_statistics_node_update>;
// Size constant; in the visible code it is only mentioned inside a commented-out declaration.
#define N 3000001

// Modulus (1e9+7) for all modular arithmetic in main().
constexpr ll mod = 1000000007;

/**
 * Program entry point.
 *
 * Input (read with sc): n and q, then a[1..n-1], then q triples (f, s, lc),
 * all 1-based. Output: one value modulo `mod` per query, in input order.
 *
 * Interpretation, taken from the original author's notes (NOTE(review):
 * confirm against the task statement): the tree is layered, every vertex of
 * layer i-1 has a[i] children, f and s are the layers of the two query
 * vertices A and B, and lc is the layer of their lowest common ancestor.
 * For each query a bitset records which distance offsets "occur an odd number
 * of times"; those are then handed out alternately to A and B, largest first,
 * to correct the half-sum (nsum[f] + nsum[s]) / 2.
 *
 * Storage notes: all per-layer tables and the per-query bitsets live on the
 * heap. The earlier version kept them as variable-length arrays plus a
 * bitset<300000>[15000] (about 560 MB) in automatic storage, which is
 * non-standard and can overflow the stack. Queries are processed in batches
 * of kBatch so that at most kBatch bitsets exist at a time.
 */
int main(){
    std::ios_base::sync_with_stdio(false);
    std::cin.tie(nullptr);

    using i64 = std::int64_t;
    constexpr int kBits = 300000;          // width of every distance bitset
    constexpr int kBatch = 15000;          // queries handled per sweep over the layers
    constexpr int kBulkThreshold = 500;    // above this, whole-bitset ops beat per-bit loops
    constexpr i64 kInv2 = 500000004;       // inverse of 2 modulo 1e9+7
    using Bits = std::bitset<kBits>;

    int n = 0, q = 0;
    sc(n, q);
    if(n < 1)
        return 0;                          // no layers: the tables below would be indexed out of range

    std::vector<int> a(n);
    a[0] = 1;
    for(int i = 1; i < n; ++i)
        sc(a[i]);

    // cnt[i]:  number of vertices in a full subtree rooted in layer i (root included).
    // dsum[i]: sum of distances from that root to every vertex of its subtree.
    std::vector<i64> cnt(n), dsum(n);
    cnt[n - 1] = 1;                        // last layer: the subtree is just the root
    dsum[n - 1] = 0;
    for(int i = n - 2; i >= 0; --i){
        cnt[i] = (1 + a[i + 1] * cnt[i + 1]) % mod;
        dsum[i] = (dsum[i + 1] + cnt[i + 1]) * a[i + 1] % mod;
    }

    // nsum[i]: sum of distances from a vertex of layer i to all other vertices.
    // cnt_above / sum_above carry the contribution of everything outside the
    // current subtree while walking down one layer at a time. Intermediate
    // values may be negative; the final answer is normalised into [0, mod).
    std::vector<i64> nsum(n);
    i64 cnt_above = 0;
    i64 sum_above = 0;
    for(int i = 0; i < n; ++i){
        nsum[i] = (dsum[i] + sum_above) % mod;
        if(i + 1 < n){
            const i64 next_cnt = (cnt_above + cnt[i] - cnt[i + 1]) % mod;
            const i64 next_sum = (sum_above + next_cnt + dsum[i] - dsum[i + 1] - cnt[i + 1]) % mod;
            cnt_above = next_cnt;
            sum_above = next_sum;
        }
    }

    // full_len[i]: for a full subtree rooted in layer i, offsets 0..full_len[i]
    //              are the "odd" ones; the run is cut by the nearest layer
    //              below whose a-value is even.
    // cut_len[i]:  same for a subtree with one child branch removed, which
    //              flips the parity of the direct children.
    std::vector<int> full_len(n), cut_len(n);
    full_len[n - 1] = 0;
    cut_len[n - 1] = 0;
    for(int i = n - 2; i >= 0; --i){
        full_len[i] = (a[i + 1] % 2 == 0) ? 0 : full_len[i + 1] + 1;
        cut_len[i] = (a[i + 1] % 2 == 1) ? 0 : full_len[i + 1] + 1;
    }

    // Read every query up front (offline processing), converting to 0-based.
    std::vector<std::array<int, 3>> queries(static_cast<std::size_t>(std::max(q, 0)));
    for(auto &qr : queries){
        int f, s, lc;
        sc(f, s, lc);
        qr[0] = f - 1;
        qr[1] = s - 1;
        qr[2] = lc - 1;
    }
    const int total = static_cast<int>(queries.size());

    Bits full;
    full.set();
    // parity[i]: offsets occurring an odd number of times for the i-th query of the batch.
    std::vector<Bits> parity(static_cast<std::size_t>(std::min(kBatch, total)));

    for(int base = 0; base < total; base += kBatch){
        const int batch = std::min(kBatch, total - base);

        // shifted_req[l]: queries that need the shifted accumulator after layer l.
        // plain_req[l]:   queries that need the unshifted accumulator after layer l;
        //                 registering a query at two layers XORs two prefixes,
        //                 i.e. yields the range between them.
        std::vector<std::vector<int>> shifted_req(n), plain_req(n);

        // Phase 1: per-query direct contributions and registration of requests.
        for(int i = 0; i < batch; ++i){
            Bits &bs = parity[i];
            bs.reset();
            const int f = queries[base + i][0];
            const int s = queries[base + i][1];
            const int lc = queries[base + i][2];
            // Toggle offsets 0..x.
            auto toggle_prefix = [&](int x){
                bs ^= full >> (kBits - x - 1);
            };

            if(f != s && (lc == f || lc == s)){
                // One endpoint is the LCA itself.
                const int deeper = std::max(f, s);
                toggle_prefix(full_len[deeper]);
                if(lc)
                    shifted_req[lc - 1].push_back(i);
                if(deeper - 1 > lc){
                    // layers lc+1 .. deeper-1 on the path
                    plain_req[deeper - 1].push_back(i);
                    plain_req[lc].push_back(i);
                }
                toggle_prefix(cut_len[lc]);    // the LCA has exactly one branch removed
            }
            else if(f == s && (lc == f || lc == s)){
                // A and B are the same vertex.
                toggle_prefix(full_len[f]);
                if(lc)
                    shifted_req[lc - 1].push_back(i);
            }
            else{
                // The LCA is strictly above both endpoints (f == s is possible:
                // same layer, different vertices).
                if(f != s){
                    toggle_prefix(full_len[f]);
                    toggle_prefix(full_len[s]);
                }
                if(lc)
                    shifted_req[lc - 1].push_back(i);
                // layers min(f,s) .. max(f,s)-1
                const int lo = std::min(f, s);
                const int hi = std::max(f, s) - 1;
                if(hi >= lo){
                    plain_req[hi].push_back(i);
                    if(lo - 1 >= 0)
                        plain_req[lo - 1].push_back(i);
                }
                toggle_prefix(full_len[lc]);   // the LCA has two branches removed
            }
        }

        // Phase 2: one sweep over the layers, serving the registered requests.
        Bits shifted_acc;   // XOR of the per-layer prefixes, shifted by one after every layer
        Bits plain_acc;     // XOR of the per-layer prefixes, never shifted
        Bits cur;           // always the prefix {0 .. cur_len-1}
        int cur_len = 0;
        for(int i = 0; i < n; ++i){
            // Layer i contributes offsets 0..cut_len[i].
            const int len = cut_len[i] + 1;
            if(cur_len > kBulkThreshold){
                cur.reset();
            }
            else{
                for(int b = 0; b < cur_len; ++b)
                    cur[b] = 0;
            }
            if(len > kBulkThreshold){
                cur |= full >> (kBits - len);
            }
            else{
                for(int b = 0; b < len; ++b)
                    cur[b] = 1;
            }
            cur_len = len;

            plain_acc ^= cur;
            shifted_acc ^= cur;
            shifted_acc <<= 1;
            for(int x : shifted_req[i])
                parity[x] ^= shifted_acc;
            for(int x : plain_req[i])
                parity[x] ^= plain_acc;
        }

        // Phase 3: turn each parity bitset into the answer.
        for(int i = 0; i < batch; ++i){
            const int f = queries[base + i][0];
            const int s = queries[base + i][1];
            const int lc = queries[base + i][2];
            const i64 distfs = f + s - 2 * lc;   // distance between A and B

            // _Find_first/_Find_next are libstdc++ extensions; they return
            // kBits when there is no further set bit.
            std::vector<int> offsets;
            const Bits &bs = parity[i];
            for(int b = bs._Find_first(); b != kBits; b = bs._Find_next(b))
                offsets.push_back(b);

            i64 dif = 0;             // deviation from (sum_A + sum_B) / 2
            bool b_takes = false;    // false: A takes this term, true: B takes it
            for(auto it = offsets.rbegin(); it != offsets.rend(); ++it){
                if(!b_takes)
                    dif += (*it * 2 + distfs);
                else
                    dif -= (*it * 2 + distfs);
                b_takes = !b_takes;
            }
            const i64 res = (((nsum[f] + nsum[s] + dif) % mod * kInv2 - nsum[s]) % mod + mod) % mod;
            std::cout << res << '\n';
        }
    }
}